Skip to content

Commit

Permalink
Optimize dvcignore (#4242)
Browse files Browse the repository at this point in the history
* Update

1. unit test update
2. two might cause bugs

* dvc ignore

* Some refactoring

* clean

* Config tree use

* Solve sub dir repo problem. Add more tests

* typo error

* Change request

Co-authored-by: karajan1001 <[email protected]>
karajan1001 and karajan1001 authored Jul 22, 2020
1 parent a4b3c6d commit 67896aa
Showing 17 changed files with 171 additions and 209 deletions.
2 changes: 1 addition & 1 deletion dvc/config.py
Original file line number Diff line number Diff line change
@@ -316,7 +316,7 @@ def _load_config(self, level):
filename = self.files[level]
tree = self.tree if level == "repo" else self.wtree

if tree.exists(filename):
if tree.exists(filename, use_dvcignore=False):
with tree.open(filename) as fobj:
conf_obj = configobj.ConfigObj(fobj)
else:
180 changes: 73 additions & 107 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
@@ -99,67 +99,6 @@ def __bool__(self):
return bool(self.pattern_list)


class DvcIgnorePatternsTrie(DvcIgnore):
trie = None

def __init__(self):
if self.trie is None:
self.trie = StringTrie(separator=os.sep)

def __call__(self, root, dirs, files):
ignore_pattern = self[root]
if ignore_pattern:
return ignore_pattern(root, dirs, files)
return dirs, files

def __setitem__(self, root, ignore_pattern):
base_pattern = self[root]
common_dirname, merged_pattern = merge_patterns(
base_pattern.dirname,
base_pattern.pattern_list,
ignore_pattern.dirname,
ignore_pattern.pattern_list,
)
self.trie[root] = DvcIgnorePatterns(merged_pattern, common_dirname)

def __getitem__(self, root):
ignore_pattern = self.trie.longest_prefix(root)
if ignore_pattern:
return ignore_pattern.value
return DvcIgnorePatterns([], root)


class DvcIgnoreDirs(DvcIgnore):
def __init__(self, basenames):
self.basenames = set(basenames)

def __call__(self, root, dirs, files):
dirs = [d for d in dirs if d not in self.basenames]

return dirs, files

def __hash__(self):
return hash(tuple(self.basenames))

def __eq__(self, other):
if not isinstance(other, DvcIgnoreDirs):
return NotImplemented

return self.basenames == other.basenames


class DvcIgnoreRepo(DvcIgnore):
def __call__(self, root, dirs, files):
def is_dvc_repo(directory):
from dvc.repo import Repo

return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))

dirs = [d for d in dirs if not is_dvc_repo(d)]

return dirs, files


class DvcIgnoreFilterNoop:
def __init__(self, tree, root_dir):
pass
@@ -175,61 +114,99 @@ def is_ignored_file(self, _):


class DvcIgnoreFilter:
@staticmethod
def _is_dvc_repo(root, directory):
from dvc.repo import Repo

return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR))

def __init__(self, tree, root_dir):
from dvc.repo import Repo

default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)]

self.tree = tree
self.root_dir = root_dir
self.ignores = {
DvcIgnoreDirs([".git", ".hg", ".dvc"]),
DvcIgnoreRepo(),
}
ignore_pattern_trie = DvcIgnorePatternsTrie()
self.ignores_trie_tree = StringTrie(separator=os.sep)
self.ignores_trie_tree[root_dir] = DvcIgnorePatterns(
default_ignore_patterns, root_dir
)
for root, dirs, _ in self.tree.walk(self.root_dir):
ignore_pattern = self._get_ignore_pattern(root)
if ignore_pattern:
ignore_pattern_trie[root] = ignore_pattern
self.ignores.add(ignore_pattern_trie)
self._update(root)
self._update_sub_repo(root, dirs)
dirs[:], _ = self(root, dirs, [])

def _get_ignore_pattern(self, dirname):
def _update(self, dirname):
ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
if self.tree.exists(ignore_file_path):
return DvcIgnorePatterns.from_files(ignore_file_path, self.tree)
return None
new_pattern = DvcIgnorePatterns.from_files(
ignore_file_path, self.tree
)
old_pattern = self._get_trie_pattern(dirname)
if old_pattern:
self.ignores_trie_tree[dirname] = DvcIgnorePatterns(
*merge_patterns(
old_pattern.pattern_list,
old_pattern.dirname,
new_pattern.pattern_list,
new_pattern.dirname,
)
)
else:
self.ignores_trie_tree[dirname] = new_pattern

def _update_sub_repo(self, root, dirs):
for d in dirs:
if self._is_dvc_repo(root, d):
old_pattern = self._get_trie_pattern(root)
if old_pattern:
self.ignores_trie_tree[root] = DvcIgnorePatterns(
*merge_patterns(
old_pattern.pattern_list,
old_pattern.dirname,
["/{}/".format(d)],
root,
)
)
else:
self.ignores_trie_tree[root] = DvcIgnorePatterns(
["/{}/".format(d)], root
)

def __call__(self, root, dirs, files):
for ignore in self.ignores:
dirs, files = ignore(root, dirs, files)
ignore_pattern = self._get_trie_pattern(root)
if ignore_pattern:
return ignore_pattern(root, dirs, files)
else:
return dirs, files

return dirs, files
def _get_trie_pattern(self, dirname):
ignore_pattern = self.ignores_trie_tree.longest_prefix(dirname).value
return ignore_pattern

def is_ignored_dir(self, path):
if not self._parents_exist(path):
def _is_ignored(self, path, is_dir=False):
if self._outside_repo(path):
return True
dirname, basename = os.path.split(os.path.normpath(path))
ignore_pattern = self._get_trie_pattern(dirname)
if ignore_pattern:
return ignore_pattern.matches(dirname, basename, is_dir)
else:
return False

def is_ignored_dir(self, path):
path = os.path.abspath(path)
if path == self.root_dir:
return False
dirname, basename = os.path.split(path)
dirs, _ = self(dirname, [basename], [])
return not dirs

def is_ignored_file(self, path):
if not self._parents_exist(path):
return True

dirname, basename = os.path.split(os.path.normpath(path))
_, files = self(os.path.abspath(dirname), [], [basename])
return not files
return self._is_ignored(path, True)

def _parents_exist(self, path):
from dvc.repo import Repo
def is_ignored_file(self, path):
return self._is_ignored(path, False)

def _outside_repo(self, path):
path = PathInfo(path)

# if parent is root_dir or inside a .dvc dir we can skip this check
if path.parent == self.root_dir or Repo.DVC_DIR in path.parts:
return True

# paths outside of the repo should be ignored
path = relpath(path, self.root_dir)
if path.startswith("..") or (
@@ -238,16 +215,5 @@ def _parents_exist(self, path):
[os.path.abspath(path), self.root_dir]
)
):
return False

# check if parent directories are in our ignores, starting from
# root_dir
for parent_dir in reversed(PathInfo(path).parents):
dirname, basename = os.path.split(parent_dir)
if basename == ".":
# parent_dir == root_dir
continue
dirs, _ = self(os.path.abspath(dirname), [basename], [])
if not dirs:
return False
return True
return True
return False
14 changes: 8 additions & 6 deletions dvc/pathspec_math.py
Original file line number Diff line number Diff line change
@@ -6,6 +6,8 @@

from pathspec.util import normalize_file

from dvc.utils import relpath


def _not_ignore(rule):
return (True, rule[1:]) if rule.startswith("!") else (False, rule)
@@ -53,14 +55,14 @@ def change_rule(rule, rel):
def _change_dirname(dirname, pattern_list, new_dirname):
if new_dirname == dirname:
return pattern_list
rel = os.path.relpath(dirname, new_dirname)
rel = relpath(dirname, new_dirname)
if rel.startswith(".."):
raise ValueError("change dirname can only change to parent path")

return [change_rule(rule, rel) for rule in pattern_list]


def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b):
def merge_patterns(pattern_a, prefix_a, pattern_b, prefix_b):
"""
Merge two path specification patterns.
@@ -69,17 +71,17 @@ def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b):
based on this new base directory.
"""
if not pattern_a:
return prefix_b, pattern_b
return pattern_b, prefix_b
elif not pattern_b:
return prefix_a, pattern_a
return pattern_a, prefix_a

longest_common_dir = os.path.commonpath([prefix_a, prefix_b])
new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir)
new_pattern_b = _change_dirname(prefix_b, pattern_b, longest_common_dir)

if len(prefix_a) < len(prefix_b):
if len(prefix_a) <= len(prefix_b):
merged_pattern = new_pattern_a + new_pattern_b
else:
merged_pattern = new_pattern_b + new_pattern_a

return longest_common_dir, merged_pattern
return merged_pattern, longest_common_dir
4 changes: 3 additions & 1 deletion dvc/repo/tree.py
Original file line number Diff line number Diff line change
@@ -277,7 +277,9 @@ def open(
)
return self.repo.tree.open(path, mode=mode, encoding=encoding)

def exists(self, path): # pylint: disable=arguments-differ
def exists(
self, path, use_dvcignore=True
): # pylint: disable=arguments-differ
return self.repo.tree.exists(path) or (
self.dvctree and self.dvctree.exists(path)
)
2 changes: 1 addition & 1 deletion dvc/tree/azure.py
Original file line number Diff line number Diff line change
@@ -101,7 +101,7 @@ def _generate_download_url(self, path_info, expires=3600):
)
return download_url

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
paths = self._list_paths(path_info.bucket, path_info.path)
return any(path_info.path == path for path in paths)

2 changes: 1 addition & 1 deletion dvc/tree/base.py
Original file line number Diff line number Diff line change
@@ -156,7 +156,7 @@ def open(self, path_info, mode="r", encoding=None):

raise RemoteActionNotImplemented("open", self.scheme)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
raise NotImplementedError

# pylint: disable=unused-argument
2 changes: 1 addition & 1 deletion dvc/tree/gdrive.py
Original file line number Diff line number Diff line change
@@ -522,7 +522,7 @@ def _get_item_id(self, path_info, create=False, use_cache=True, hint=None):
assert not create
raise FileMissingError(path_info, hint)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
try:
self._get_item_id(path_info)
except FileMissingError:
6 changes: 5 additions & 1 deletion dvc/tree/git.py
Original file line number Diff line number Diff line change
@@ -76,9 +76,13 @@ def open(
return io.BytesIO(data)
return io.StringIO(data.decode(encoding))

def exists(self, path): # pylint: disable=arguments-differ
def exists(
self, path, use_dvcignore=True
): # pylint: disable=arguments-differ
if self._git_object_by_path(path) is None:
return False
if not use_dvcignore:
return True

return not self.dvcignore.is_ignored_file(
path
2 changes: 1 addition & 1 deletion dvc/tree/gs.py
Original file line number Diff line number Diff line change
@@ -119,7 +119,7 @@ def _generate_download_url(self, path_info, expires=3600):
)
return signing_credentials.signer.sign(blob)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
"""Check if the blob exists. If it does not exist,
it could be a part of a directory path.
2 changes: 1 addition & 1 deletion dvc/tree/hdfs.py
Original file line number Diff line number Diff line change
@@ -72,7 +72,7 @@ def open(self, path_info, mode="r", encoding=None):
raise FileNotFoundError(*e.args)
raise

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
assert not isinstance(path_info, list)
assert path_info.scheme == "hdfs"
with self.hdfs(path_info) as hdfs:
2 changes: 1 addition & 1 deletion dvc/tree/http.py
Original file line number Diff line number Diff line change
@@ -122,7 +122,7 @@ def request(self, method, url, **kwargs):
except requests.exceptions.RequestException:
raise DvcException(f"could not perform a {method} request")

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
return bool(self.request("HEAD", path_info.url))

def get_file_hash(self, path_info):
4 changes: 3 additions & 1 deletion dvc/tree/local.py
Original file line number Diff line number Diff line change
@@ -69,14 +69,16 @@ def dvcignore(self):
def open(path_info, mode="r", encoding=None):
return open(path_info, mode=mode, encoding=encoding)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
assert isinstance(path_info, str) or path_info.scheme == "local"
if self.repo:
ret = os.path.lexists(path_info)
else:
ret = os.path.exists(path_info)
if not ret:
return False
if not use_dvcignore:
return True

return not self.dvcignore.is_ignored_file(
path_info
2 changes: 1 addition & 1 deletion dvc/tree/oss.py
Original file line number Diff line number Diff line change
@@ -88,7 +88,7 @@ def _generate_download_url(self, path_info, expires=3600):

return self.oss_service.sign_url("GET", path_info.path, expires)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
paths = self._list_paths(path_info)
return any(path_info.path == path for path in paths)

2 changes: 1 addition & 1 deletion dvc/tree/s3.py
Original file line number Diff line number Diff line change
@@ -131,7 +131,7 @@ def _generate_download_url(self, path_info, expires=3600):
ClientMethod="get_object", Params=params, ExpiresIn=int(expires)
)

def exists(self, path_info):
def exists(self, path_info, use_dvcignore=True):
"""Check if the blob exists. If it does not exist,
it could be a part of a directory path.
Loading

0 comments on commit 67896aa

Please sign in to comment.