diff --git a/MergeConflict.md b/MergeConflict.md index a0764dc15..4a04609fb 100644 --- a/MergeConflict.md +++ b/MergeConflict.md @@ -15,7 +15,7 @@ $ oxen add test.csv $ oxen commit -m "adding a dog" # checkout a new branch where you are going to append a cat -$ oxen checkout -b "adding-cat" +$ oxen checkout -b adding-cat $ echo "images/cat.png,cat" >> test.csv $ oxen add test.csv $ oxen commit -m "adding a cat" diff --git a/src/cli/src/cmd_setup.rs b/src/cli/src/cmd_setup.rs index 552ff9937..06df9f73e 100644 --- a/src/cli/src/cmd_setup.rs +++ b/src/cli/src/cmd_setup.rs @@ -519,8 +519,8 @@ pub fn pull() -> Command<'static> { pub fn diff() -> Command<'static> { Command::new(DIFF) - .about("Compare file from a commit history") - .arg(Arg::new("FILE_OR_COMMIT_ID").required(true)) + .about("Compare two files against each other or against versions. The first parameter can be one of three things 1) another file 2) a commit hash 3) a branch name. If the first parameter is a committish it will compare the second parameter path to that version of the file.") + .arg(Arg::new("FILE_OR_COMMITTISH").required(true)) .arg(Arg::new("PATH").required(false)) } diff --git a/src/cli/src/parse_and_run.rs b/src/cli/src/parse_and_run.rs index ed7bd4fd1..b6c153885 100644 --- a/src/cli/src/parse_and_run.rs +++ b/src/cli/src/parse_and_run.rs @@ -623,7 +623,9 @@ pub async fn diff(sub_matches: &ArgMatches) { async fn p_diff(sub_matches: &ArgMatches, is_remote: bool) { // First arg is optional - let file_or_commit_id = sub_matches.value_of("FILE_OR_COMMIT_ID").expect("required"); + let file_or_commit_id = sub_matches + .value_of("FILE_OR_COMMITTISH") + .expect("required"); let path = sub_matches.value_of("PATH"); if let Some(path) = path { match dispatch::diff(Some(file_or_commit_id), path, is_remote).await { diff --git a/src/lib/src/command.rs b/src/lib/src/command.rs index 93c8e50cb..5e610be59 100644 --- a/src/lib/src/command.rs +++ b/src/lib/src/command.rs @@ -1187,14 +1187,65 @@ pub async fn pull(repo: &LocalRepository) -> Result<(), OxenError> { Ok(()) } -/// Diff a file from commit history +/// Diff a file from a commit or compared to another file +/// `resource` can be a None, commit id, branch name, or another path. +/// None: compare `path` to the last commit versioned of the file. If a merge conflict with compare to the merge conflict +/// commit id: compare `path` to the version of `path` from that commit +/// branch name: compare `path` to the version of `path` from that branch +/// another path: compare `path` to the other `path` provided +/// `path` is the path you want to compare the resource to pub fn diff( repo: &LocalRepository, - commit_id_or_branch: Option<&str>, - path: &Path, + resource: Option<&str>, + path: impl AsRef, ) -> Result { - let commit = resource::get_commit_or_head(repo, commit_id_or_branch)?; - differ::diff(repo, Some(&commit.id), path) + if let Some(resource) = resource { + // `resource` is Some(resource) + if let Some(compare_commit) = api::local::commits::get_by_id(repo, resource)? { + // `resource` is a commit id + let original_commit = head_commit(repo)?; + differ::diff(repo, &original_commit, &compare_commit, path) + } else if let Some(branch) = api::local::branches::get_by_name(repo, resource)? { + // `resource` is a branch name + let compare_commit = api::local::commits::get_by_id(repo, &branch.commit_id)?.unwrap(); + let original_commit = head_commit(repo)?; + + differ::diff(repo, &original_commit, &compare_commit, path) + } else if Path::new(resource).exists() { + // `resource` is another path + differ::diff_files(resource, path) + } else { + Err(OxenError::basic_str(format!( + "Could not find resource: {resource:?}" + ))) + } + } else { + // `resource` is None + // First check if there are merge conflicts + let merger = MergeConflictReader::new(repo)?; + if merger.has_conflicts()? { + match merger.get_conflict_commit() { + Ok(Some(commit)) => { + let current_path = path.as_ref(); + let version_path = + differ::get_version_file_from_commit(repo, &commit, current_path)?; + differ::diff_files(current_path, version_path) + } + err => { + log::error!("{err:?}"); + Err(OxenError::basic_str(format!( + "Could not find merge resource: {resource:?}" + ))) + } + } + } else { + // No merge conflicts, compare to last version committed of the file + let current_path = path.as_ref(); + let commit = head_commit(repo)?; + let version_path = differ::get_version_file_from_commit(repo, &commit, current_path)?; + differ::diff_files(version_path, current_path) + } + } } pub async fn remote_diff( diff --git a/src/lib/src/error.rs b/src/lib/src/error.rs index 756625f2f..50f9574cf 100644 --- a/src/lib/src/error.rs +++ b/src/lib/src/error.rs @@ -173,6 +173,11 @@ impl OxenError { OxenError::basic_str(err) } + pub fn file_has_no_file_name>(path: T) -> OxenError { + let err = format!("File has no file_name: {:?}", path.as_ref()); + OxenError::basic_str(err) + } + pub fn could_not_convert_path_to_str>(path: T) -> OxenError { let err = format!("File has no name: {:?}", path.as_ref()); OxenError::basic_str(err) diff --git a/src/lib/src/index/differ.rs b/src/lib/src/index/differ.rs index 97c8bbbb4..422b24a40 100644 --- a/src/lib/src/index/differ.rs +++ b/src/lib/src/index/differ.rs @@ -1,6 +1,6 @@ use crate::df::{tabular, DFOpts}; use crate::error::OxenError; -use crate::index::{CommitDirEntryReader, CommitReader}; +use crate::index::CommitDirEntryReader; use crate::model::entry::diff_entry::DiffEntryStatus; use crate::model::{Commit, CommitEntry, DataFrameDiff, DiffEntry, LocalRepository, Schema}; use crate::{constants, util}; @@ -11,69 +11,77 @@ use polars::export::ahash::HashMap; use polars::prelude::DataFrame; use polars::prelude::IntoLazy; use std::collections::HashSet; -use std::path::Path; +use std::path::{Path, PathBuf}; -use super::{CommitDirReader, SchemaReader}; +use super::CommitDirReader; pub fn diff( repo: &LocalRepository, - commit_id: Option<&str>, - path: &Path, + original: &Commit, + compare: &Commit, + path: impl AsRef, ) -> Result { - match _commit_or_head(repo, commit_id)? { - Some(commit) => _diff_commit(repo, &commit, path), - None => Err(OxenError::commit_id_does_not_exist(commit_id.unwrap())), - } + let original_path = get_version_file_from_commit(repo, original, &path)?; + let compare_path = get_version_file_from_commit(repo, compare, &path)?; + diff_files(original_path, compare_path) } -fn _commit_or_head( +pub fn get_version_file_from_commit( repo: &LocalRepository, - commit_id: Option<&str>, -) -> Result, OxenError> { - let commit_reader = CommitReader::new(repo)?; - if let Some(commit_id) = commit_id { - commit_reader.get_commit_by_id(commit_id) - } else { - Ok(Some(commit_reader.head_commit()?)) - } + commit: &Commit, + path: impl AsRef, +) -> Result { + let path = path.as_ref(); + + // Extract parent so we can use it to instantiate CommitDirEntryReader + let parent = match path.parent() { + Some(parent) => parent, + None => return Err(OxenError::file_has_no_parent(path)), + }; + + // Instantiate CommitDirEntryReader to fetch entry + let relative_parent = util::fs::path_relative_to_dir(parent, &repo.path)?; + let commit_entry_reader = CommitDirEntryReader::new(repo, &commit.id, &relative_parent)?; + let file_name = match path.file_name() { + Some(file_name) => file_name, + None => return Err(OxenError::file_has_no_file_name(path)), + }; + + // Get entry from the reader + let entry = match commit_entry_reader.get_entry(file_name) { + Ok(Some(entry)) => entry, + _ => return Err(OxenError::file_does_not_exist_in_commit(path, &commit.id)), + }; + + Ok(util::fs::version_path(repo, &entry)) } -// TODO: Change API to take two commits -fn _diff_commit(repo: &LocalRepository, commit: &Commit, path: &Path) -> Result { - if let Some(parent) = path.parent() { - let relative_parent = util::fs::path_relative_to_dir(parent, &repo.path)?; - let commit_entry_reader = CommitDirEntryReader::new(repo, &commit.id, &relative_parent)?; - let file_name = path.file_name().unwrap(); - if let Ok(Some(entry)) = commit_entry_reader.get_entry(file_name) { - if util::fs::is_tabular(path) { - let commit_reader = CommitReader::new(repo)?; - let commits = commit_reader.history_from_head()?; - - let current_commit = commits.first().unwrap(); - - return diff_tabular(repo, current_commit, &entry.path); - } else if util::fs::is_utf8(path) { - // TODO: Change API to take two commits - return diff_utf8(repo, &entry); - } - Err(OxenError::basic_str(format!( - "Diff not supported for file: {path:?}" - ))) - } else { - Err(OxenError::file_does_not_exist_in_commit(path, &commit.id)) - } - } else { - Err(OxenError::file_has_no_parent(path)) +pub fn diff_files( + original: impl AsRef, + compare: impl AsRef, +) -> Result { + let original = original.as_ref(); + let compare = compare.as_ref(); + if util::fs::is_tabular(original) && util::fs::is_tabular(compare) { + let tabular_diff = diff_tabular(original, compare)?; + return Ok(tabular_diff.to_string()); + } else if util::fs::is_utf8(original) && util::fs::is_utf8(compare) { + return diff_utf8(original, compare); } + Err(OxenError::basic_str(format!( + "Diff not supported for files: {original:?} and {compare:?}" + ))) } -pub fn diff_utf8(repo: &LocalRepository, entry: &CommitEntry) -> Result { - let current_path = repo.path.join(&entry.path); - let version_path = util::fs::version_path(repo, entry); - - let original = util::fs::read_from_path(&version_path)?; - let modified = util::fs::read_from_path(¤t_path)?; - let Changeset { diffs, .. } = Changeset::new(&original, &modified, "\n"); +pub fn diff_utf8( + original: impl AsRef, + compare: impl AsRef, +) -> Result { + let original = original.as_ref(); + let compare = compare.as_ref(); + let original_data = util::fs::read_from_path(original)?; + let compare_data = util::fs::read_from_path(compare)?; + let Changeset { diffs, .. } = Changeset::new(&original_data, &compare_data, "\n"); let mut outputs: Vec = vec![]; for diff in diffs { @@ -100,62 +108,57 @@ pub fn diff_utf8(repo: &LocalRepository, entry: &CommitEntry) -> Result Result { - let schema_reader = SchemaReader::new(repo, &commit.id)?; - if let Some(schema) = schema_reader.get_schema_for_file(path)? { - let diff = compute_dataframe_diff(repo, commit, &schema, path)?; - Ok(diff.to_string()) - } else { - Err(OxenError::schema_does_not_exist_for_file(path)) + original_path: impl AsRef, + compare_path: impl AsRef, +) -> Result { + let original_path = original_path.as_ref(); + let compare_path = compare_path.as_ref(); + // Make sure files exist + if !original_path.exists() { + return Err(OxenError::file_does_not_exist(original_path)); } -} -fn compute_dataframe_diff( - repo: &LocalRepository, - commit: &Commit, - versioned_schema: &Schema, - path: &Path, -) -> Result { - let commit_entry_reader = CommitDirEntryReader::new(repo, &commit.id, path.parent().unwrap())?; - let filename = Path::new(path.file_name().unwrap().to_str().unwrap()); - if let Some(entry) = commit_entry_reader.get_entry(filename)? { - // Read current DF and get schema - let current_path = repo.path.join(path); - let versioned_path = util::fs::version_path(repo, &entry); - let current_df = tabular::read_df(¤t_path, DFOpts::empty())?; - let current_schema = Schema::from_polars(¤t_df.schema()); - - // If schemas don't match, figure out which columns are different - if versioned_schema.hash != current_schema.hash { - compute_new_columns( - ¤t_path, - &versioned_path, - ¤t_schema, - versioned_schema, - ) - } else { - println!("Computing diff for {path:?}"); - // Schemas match, find added and removed rows - // Read versioned df - let versioned_df = tabular::read_df(&versioned_path, DFOpts::empty())?; - compute_new_rows(current_df, versioned_df, versioned_schema) - } + if !compare_path.exists() { + return Err(OxenError::file_does_not_exist(compare_path)); + } + + // Read DFs and get schemas + let original_df = tabular::read_df(original_path, DFOpts::empty())?; + let compare_df = tabular::read_df(compare_path, DFOpts::empty())?; + let original_schema = Schema::from_polars(&original_df.schema()); + let compare_schema = Schema::from_polars(&compare_df.schema()); + + log::debug!( + "Original df {} {original_path:?}\n{original_df:?}", + original_schema.hash + ); + log::debug!( + "Compare df {} {compare_path:?}\n{compare_df:?}", + compare_schema.hash + ); + + // If schemas don't match, figure out which columns are different + if original_schema.hash != compare_schema.hash { + compute_new_columns( + original_path, + compare_path, + &original_schema, + &compare_schema, + ) } else { - Err(OxenError::file_does_not_exist(path)) + log::debug!("Computing diff for {original_path:?} to {compare_path:?}"); + compute_new_rows(original_df, compare_df, &original_schema) } } fn compute_new_rows( - current_df: DataFrame, - versioned_df: DataFrame, - versioned_schema: &Schema, + original_df: DataFrame, + compare_df: DataFrame, + schema: &Schema, ) -> Result { // Hash the rows - let versioned_df = tabular::df_hash_rows(versioned_df)?; - let current_df = tabular::df_hash_rows(current_df)?; + let versioned_df = tabular::df_hash_rows(original_df)?; + let current_df = tabular::df_hash_rows(compare_df)?; // log::debug!("diff_current got current hashes {}", current_df); @@ -200,17 +203,17 @@ fn compute_new_rows( // log::debug!("diff_current removed_indices {:?}", removed_indices); // Take added from the current df - let opts = DFOpts::from_schema_columns(versioned_schema); + let opts = DFOpts::from_schema_columns(schema); let current_df = tabular::transform(current_df, opts)?; let added_rows = tabular::take(current_df.lazy(), added_indices)?; // Take removed from versioned df - let opts = DFOpts::from_schema_columns(versioned_schema); + let opts = DFOpts::from_schema_columns(schema); let versioned_df = tabular::transform(versioned_df, opts)?; let removed_rows = tabular::take(versioned_df.lazy(), removed_indices)?; Ok(DataFrameDiff { - base_schema: versioned_schema.to_owned(), + base_schema: schema.to_owned(), added_rows: if added_rows.height() > 0 { Some(added_rows) } else { @@ -227,10 +230,10 @@ fn compute_new_rows( } fn compute_new_columns( - current_path: &Path, versioned_path: &Path, - current_schema: &Schema, + current_path: &Path, versioned_schema: &Schema, + current_schema: &Schema, ) -> Result { let added_fields = current_schema.added_fields(versioned_schema); let removed_fields = current_schema.removed_fields(versioned_schema); diff --git a/src/lib/src/index/merge_conflict_reader.rs b/src/lib/src/index/merge_conflict_reader.rs index abe1a52ff..4f10f7e34 100644 --- a/src/lib/src/index/merge_conflict_reader.rs +++ b/src/lib/src/index/merge_conflict_reader.rs @@ -1,15 +1,18 @@ -use crate::constants::MERGE_DIR; +use crate::constants::{MERGE_DIR, MERGE_HEAD_FILE}; use crate::db; use crate::error::OxenError; use crate::index::MergeConflictDBReader; -use crate::model::{LocalRepository, MergeConflict}; +use crate::model::{Commit, LocalRepository, MergeConflict}; use crate::util; use rocksdb::DB; use std::path::Path; +use super::CommitReader; + pub struct MergeConflictReader { merge_db: DB, + repository: LocalRepository, } impl MergeConflictReader { @@ -26,9 +29,18 @@ impl MergeConflictReader { Ok(MergeConflictReader { merge_db: DB::open_for_read_only(&opts, dunce::simplified(&db_path), false)?, + repository: repo.clone(), }) } + pub fn get_conflict_commit(&self) -> Result, OxenError> { + let hidden_dir = util::fs::oxen_hidden_dir(&self.repository.path); + let merge_head_path = hidden_dir.join(MERGE_HEAD_FILE); + let commit_id = util::fs::read_first_line(merge_head_path)?; + let commit_reader = CommitReader::new(&self.repository)?; + commit_reader.get_commit_by_id(commit_id) + } + pub fn has_conflicts(&self) -> Result { MergeConflictDBReader::has_conflicts(&self.merge_db) } diff --git a/src/lib/src/index/merger.rs b/src/lib/src/index/merger.rs index 12a4bbf67..6393245ad 100644 --- a/src/lib/src/index/merger.rs +++ b/src/lib/src/index/merger.rs @@ -876,7 +876,7 @@ mod tests { } #[tokio::test] - async fn test_has_merge_conflicts_without_merging() -> Result<(), OxenError> { + async fn test_merger_has_merge_conflicts_without_merging() -> Result<(), OxenError> { test::run_empty_local_repo_test_async(|repo| async move { // This case for a three way merge was failing, if one branch gets fast forwarded, then the next // should have a conflict from the LCA diff --git a/tests/integration_test.rs b/tests/integration_test.rs index a69cae9d6..06abab3fa 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -2193,7 +2193,82 @@ async fn test_can_add_merge_conflict() -> Result<(), OxenError> { .await } -// TODO: Test what should happen on a diff during a merge conflict for a dataframe +// Test diff during a merge conflict should show conflicts for a dataframe +#[tokio::test] +async fn test_has_merge_conflicts_without_merging() -> Result<(), OxenError> { + test::run_empty_local_repo_test_async(|repo| async move { + let og_branch = command::current_branch(&repo)?.unwrap(); + let data_path = repo.path.join("data.csv"); + util::fs::write_to_path(&data_path, "file,label\nimages/0.png,dog\n")?; + command::add(&repo, &data_path)?; + command::commit(&repo, "Add initial data.csv file with dog")?; + + // Add a fish label to the file on a branch + let fish_branch_name = "add-fish-label"; + command::create_checkout_branch(&repo, fish_branch_name)?; + let data_path = test::append_line_txt_file(data_path, "images/fish.png,fish\n")?; + command::add(&repo, &data_path)?; + command::commit(&repo, "Adding fish to data.csv file")?; + + // Checkout main, and branch from it to another branch to add a cat label + command::checkout(&repo, &og_branch.name).await?; + let cat_branch_name = "add-cat-label"; + command::create_checkout_branch(&repo, cat_branch_name)?; + let data_path = test::append_line_txt_file(data_path, "images/cat.png,cat\n")?; + command::add(&repo, &data_path)?; + command::commit(&repo, "Adding cat to data.csv file")?; + + // Checkout main again + command::checkout(&repo, &og_branch.name).await?; + + // Merge the fish branch in + let result = command::merge(&repo, fish_branch_name)?; + assert!(result.is_some()); + + // And then the cat branch should have conflicts + let result = command::merge(&repo, cat_branch_name)?; + assert!(result.is_none()); + + // Make sure we can access the conflicts in the status command + let status = command::status(&repo)?; + assert_eq!(status.merge_conflicts.len(), 1); + + // Get the diff dataframe + let diff = command::diff(&repo, None, &data_path)?; + log::debug!("{diff:?}"); + + assert_eq!( + diff, + r"Added Rows + +shape: (1, 2) +┌────────────────┬───────┐ +│ file ┆ label │ +│ --- ┆ --- │ +│ str ┆ str │ +╞════════════════╪═══════╡ +│ images/cat.png ┆ cat │ +└────────────────┴───────┘ + + +Removed Rows + +shape: (1, 2) +┌─────────────────┬───────┐ +│ file ┆ label │ +│ --- ┆ --- │ +│ str ┆ str │ +╞═════════════════╪═══════╡ +│ images/fish.png ┆ fish │ +└─────────────────┴───────┘ + +" + ); + + Ok(()) + }) + .await +} #[tokio::test] async fn test_commit_after_merge_conflict() -> Result<(), OxenError> { @@ -2815,7 +2890,7 @@ fn test_diff_tabular_add_col() -> Result<(), OxenError> { let bbox_filename = Path::new("annotations") .join("train") .join("bounding_box.csv"); - let bbox_file = repo.path.join(&bbox_filename); + let bbox_file = repo.path.join(bbox_filename); let mut opts = DFOpts::empty(); // Add Column @@ -2823,9 +2898,11 @@ fn test_diff_tabular_add_col() -> Result<(), OxenError> { // Save to Output opts.output = Some(bbox_file.clone()); // Perform df transform - command::df(bbox_file, opts)?; + command::df(&bbox_file, opts)?; + + let diff = command::diff(&repo, None, &bbox_file); + println!("{:?}", diff); - let diff = command::diff(&repo, None, &bbox_filename); assert!(diff.is_ok()); let diff = diff.unwrap(); assert_eq!( @@ -2859,7 +2936,7 @@ fn test_diff_tabular_add_row() -> Result<(), OxenError> { let bbox_filename = Path::new("annotations") .join("train") .join("bounding_box.csv"); - let bbox_file = repo.path.join(&bbox_filename); + let bbox_file = repo.path.join(bbox_filename); let mut opts = DFOpts::empty(); // Add Row @@ -2868,9 +2945,9 @@ fn test_diff_tabular_add_row() -> Result<(), OxenError> { // Save to Output opts.output = Some(bbox_file.clone()); // Perform df transform - command::df(bbox_file, opts)?; + command::df(&bbox_file, opts)?; - match command::diff(&repo, None, &bbox_filename) { + match command::diff(&repo, None, &bbox_file) { Ok(diff) => { println!("{diff}"); @@ -2905,10 +2982,10 @@ fn test_diff_tabular_remove_row() -> Result<(), OxenError> { let bbox_filename = Path::new("annotations") .join("train") .join("bounding_box.csv"); - let bbox_file = repo.path.join(&bbox_filename); + let bbox_file = repo.path.join(bbox_filename); // Remove a row - test::modify_txt_file( + let bbox_file = test::modify_txt_file( bbox_file, r" file,label,min_x,min_y,width,height @@ -2918,7 +2995,7 @@ train/cat_2.jpg,cat,30.5,44.0,333,396 ", )?; - match command::diff(&repo, None, &bbox_filename) { + match command::diff(&repo, None, bbox_file) { Ok(diff) => { println!("{diff}");