From 2a1af0b880aa2e1eebb8e0d67ac7aff5d530befc Mon Sep 17 00:00:00 2001
From: Peter Rowlands
Date: Mon, 21 Nov 2022 18:05:18 +0900
Subject: [PATCH] add: merge existing file meta for versioned dirs

Output.merge_version_meta() copies `meta` from a previously saved output
when both outputs have the same hash_info. For directory outputs it
delegates to _merge_dir_version_meta(), which rebuilds the tree object
with dvc_data.hashfile.tree.update_meta and refreshes `files`.

Stage.save() and Stage.save_outs() gain a `merge_versioned` keyword
(default False); `dvc add` now calls stage.save(merge_versioned=True).
save_outs() reloads the stage and merges only when at least one of the
reloaded outputs has `files` set or a `meta.version_id`. If the reload
raises StageFileDoesNotExistError or StageNotFound, merging is skipped.
---
 dvc/output.py         | 22 ++++++++++++++++++++++
 dvc/repo/add.py       |  2 +-
 dvc/stage/__init__.py | 33 ++++++++++++++++++++++++++++++---
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/dvc/output.py b/dvc/output.py
index 37f763fe6a..00f471778c 100644
--- a/dvc/output.py
+++ b/dvc/output.py
@@ -1203,6 +1203,28 @@ def restore_fields(self, other: "Output"):
         self.remote = other.remote
         self.can_push = other.can_push
 
+    def merge_version_meta(self, other: "Output"):
+        """Merge version meta for files which are unchanged from other."""
+        if not self.hash_info:
+            return
+        if self.hash_info.isdir:
+            return self._merge_dir_version_meta(other)
+        if self.hash_info != other.hash_info:
+            return
+        self.meta = other.meta
+
+    def _merge_dir_version_meta(self, other: "Output"):
+        from dvc_data.hashfile.tree import update_meta
+
+        if not self.obj or not other.hash_info.isdir:
+            return
+        other_obj = other.obj if other.obj is not None else other.get_obj()
+        assert isinstance(self.obj, Tree) and isinstance(other_obj, Tree)
+        updated = update_meta(self.obj, other_obj)
+        assert updated.hash_info == self.obj.hash_info
+        self.obj = updated
+        self.files = updated.as_list(with_meta=True)
+
 
 META_SCHEMA = {
     Meta.PARAM_SIZE: int,
diff --git a/dvc/repo/add.py b/dvc/repo/add.py
index 3c28316d00..be341724c2 100644
--- a/dvc/repo/add.py
+++ b/dvc/repo/add.py
@@ -187,7 +187,7 @@ def add(  # noqa: C901
                 stage.transfer(source, to_remote=to_remote, odb=odb, **kwargs)
             else:
                 try:
-                    stage.save()
+                    stage.save(merge_versioned=True)
                     if not no_commit:
                         stage.commit()
                 except CacheLinkError:
diff --git a/dvc/stage/__init__.py b/dvc/stage/__init__.py
index 1d47840e5c..6bdb5446c6 100644
--- a/dvc/stage/__init__.py
+++ b/dvc/stage/__init__.py
@@ -463,10 +463,12 @@ def compute_md5(self):
         logger.debug("Computed %s md5: '%s'", self, m)
         return m
 
-    def save(self, allow_missing=False):
+    def save(self, allow_missing: bool = False, merge_versioned: bool = False):
         self.save_deps(allow_missing=allow_missing)
-        self.save_outs(allow_missing=allow_missing)
+        self.save_outs(
+            allow_missing=allow_missing, merge_versioned=merge_versioned
+        )
 
         self.md5 = self.compute_md5()
 
         self.repo.stage_cache.save(self)
@@ -481,15 +483,40 @@ def save_deps(self, allow_missing=False):
                 if not allow_missing:
                     raise
 
-    def save_outs(self, allow_missing=False):
+    def save_outs(
+        self, allow_missing: bool = False, merge_versioned: bool = False
+    ):
         from dvc.output import OutputDoesNotExistError
 
+        from .exceptions import StageFileDoesNotExistError, StageNotFound
+
+        if merge_versioned:
+            try:
+                old = self.reload()
+                old_outs = {out.def_path: out for out in old.outs}
+                merge_versioned = any(
+                    (
+                        out.files is not None
+                        or (
+                            out.meta is not None
+                            and out.meta.version_id is not None
+                        )
+                    )
+                    for out in old_outs.values()
+                )
+            except (StageFileDoesNotExistError, StageNotFound):
+                merge_versioned = False
+
         for out in self.outs:
             try:
                 out.save()
             except OutputDoesNotExistError:
                 if not (allow_missing or out.checkpoint):
                     raise
+            if merge_versioned:
+                old_out = old_outs.get(out.def_path)
+                if old_out is not None:
+                    out.merge_version_meta(old_out)
 
     def ignore_outs(self):
         for out in self.outs: