Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

worktree add: preserve version metadata for unmodified files on dvc add #8595

Merged
merged 2 commits into from
Nov 21, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions dvc/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -1197,6 +1197,34 @@ def is_metric(self) -> bool:
def is_plot(self) -> bool:
return bool(self.plot)

def restore_fields(self, other: "Output"):
"""Restore attributes that need to be preserved when serialized."""
self.annot = other.annot
self.remote = other.remote
self.can_push = other.can_push

def merge_version_meta(self, other: "Output"):
"""Merge version meta for files which are unchanged from other."""
if not self.hash_info:
return
if self.hash_info.isdir:
return self._merge_dir_version_meta(other)
if self.hash_info != other.hash_info:
return
self.meta = other.meta

def _merge_dir_version_meta(self, other: "Output"):
from dvc_data.hashfile.tree import update_meta

if not self.obj or not other.hash_info.isdir:
return
other_obj = other.obj if other.obj is not None else other.get_obj()
assert isinstance(self.obj, Tree) and isinstance(other_obj, Tree)
updated = update_meta(self.obj, other_obj)
assert updated.hash_info == self.obj.hash_info
self.obj = updated
self.files = updated.as_list(with_meta=True)


META_SCHEMA = {
Meta.PARAM_SIZE: int,
Expand Down
2 changes: 1 addition & 1 deletion dvc/repo/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ def add( # noqa: C901
stage.transfer(source, to_remote=to_remote, odb=odb, **kwargs)
else:
try:
stage.save()
stage.save(merge_versioned=True)
if not no_commit:
stage.commit()
except CacheLinkError:
Expand Down
42 changes: 34 additions & 8 deletions dvc/stage/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,11 @@ def restore_fields(stage):
stage.meta = old.meta
stage.desc = old.desc

old_fields = {
out.def_path: (out.annot, out.remote, out.can_push) for out in old.outs
}
old_outs = {out.def_path: out for out in old.outs}
for out in stage.outs:
if out_fields := old_fields.get(out.def_path, None):
out.annot, out.remote, out.can_push = out_fields
old_out = old_outs.get(out.def_path, None)
if old_out is not None:
out.restore_fields(old_out)
Comment on lines +118 to +120
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Continually adding fields to these tuples was not ideal as far as maintaining it goes, is clearer/easier to update in the future by just keeping these assignments in Output



class Stage(params.StageParams):
Expand Down Expand Up @@ -464,10 +463,12 @@ def compute_md5(self):
logger.debug("Computed %s md5: '%s'", self, m)
return m

def save(self, allow_missing=False):
def save(self, allow_missing: bool = False, merge_versioned: bool = False):
self.save_deps(allow_missing=allow_missing)

self.save_outs(allow_missing=allow_missing)
self.save_outs(
allow_missing=allow_missing, merge_versioned=merge_versioned
)
self.md5 = self.compute_md5()

self.repo.stage_cache.save(self)
Expand All @@ -482,15 +483,40 @@ def save_deps(self, allow_missing=False):
if not allow_missing:
raise

def save_outs(self, allow_missing=False):
def save_outs(
self, allow_missing: bool = False, merge_versioned: bool = False
):
from dvc.output import OutputDoesNotExistError

from .exceptions import StageFileDoesNotExistError, StageNotFound

if merge_versioned:
try:
old = self.reload()
Comment on lines +493 to +495
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This extra reload on save might be expensive, so it's tied to the merge_versioned flag for now.

The problem is that we can't do this in the same place we are doing restore_fields. restore_fields is done on repo.stage.create and is intended to set fields that may be replaced/updated afterwards (i.e. we load original annotations from a dvcfile in restore_fields and then use out.annot.update() later to replace the loaded values with any new/updated ones from the command line.

But for the version metadata merge, we cannot do it until after the new outs have been save()'d, since we need the new out md5's to be computed before we can compare them to the old ones.

old_outs = {out.def_path: out for out in old.outs}
merge_versioned = any(
(
out.files is not None
or (
out.meta is not None
and out.meta.version_id is not None
)
)
for out in old_outs.values()
)
except (StageFileDoesNotExistError, StageNotFound):
merge_versioned = False

for out in self.outs:
try:
out.save()
except OutputDoesNotExistError:
if not (allow_missing or out.checkpoint):
raise
if merge_versioned:
old_out = old_outs.get(out.def_path)
if old_out is not None:
out.merge_version_meta(old_out)

def ignore_outs(self):
for out in self.outs:
Expand Down