Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

--include option #264

Merged
merged 1 commit into from
May 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ Additional optional arguments:
* ``--cache`` to use a cache other than the default of ``zstash``. If hpss is ``--hpss=none``, then this will be the archive.
* ``--exclude`` comma separated list of file patterns to exclude
* ``--follow-symlinks`` Hard copy symlinks. This is useful for preventing broken links. Note that a broken link will result in a failed create.
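* ``--include`` comma separated list of file patterns to include. If ``--exclude`` is also given, the include patterns are applied first and the exclude patterns are then applied to the result.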
* ``--keep`` to keep a copy of the tar files on the local file system after
they have been transferred to HPSS. Normally, they are deleted after
successful transfer.
Expand Down Expand Up @@ -230,6 +231,7 @@ where
* ``--dry-run`` an optional argument to specify a dry run, only lists files to be updated in archive.
* ``--exclude`` an optional comma separated list of file patterns to exclude
* ``--follow-symlinks`` Hard copy symlinks. This is useful for preventing broken links. Note that a broken link will result in a failed update.
* ``--include`` an optional comma separated list of file patterns to include
* ``--keep`` to keep a copy of the tar files on the local file system after
they have been extracted from the archive. Normally, they are deleted after
successful transfer.
Expand Down
95 changes: 89 additions & 6 deletions tests/test_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@ class TestCreate(TestZstash):
"""

# x = on, no mark = off, b = both on and off tested
# option | CreateVerbose | CreateExcludeDir | CreateExcludeFile | CreateKeep | CreateCache | TestZstash.create (used in multiple tests) | TestCheckParallel.testKeepTarsWithPreviouslySetHPSS |
# --exclude | |x|x| | | | |
# --maxsize | | | | | | |x|
# --keep | | | |x| |b| |
# --cache | | | | |x| | |
# -v |x| | | | | | |
# option | CreateVerbose | CreateIncludeDir | CreateIncludeFile | CreateExcludeDir | CreateExcludeFile | CreateKeep | CreateCache | TestZstash.create (used in multiple tests) | TestCheckParallel.testKeepTarsWithPreviouslySetHPSS |
# --exclude | | | |x|x| | | | |
# --include | |x|x| | | | | | |
# --maxsize | | | | | | | | |x|
# --keep | | | | | |x| |b| |
# --cache | | | | | | |x| | |
# -v |x| | | | | | | | |

def helperCreateVerbose(self, test_name, hpss_path: str, zstash_path=ZSTASH_PATH):
"""
Expand All @@ -34,6 +35,74 @@ def helperCreateVerbose(self, test_name, hpss_path: str, zstash_path=ZSTASH_PATH
use_hpss = self.setupDirs(test_name)
self.create(use_hpss, zstash_path, verbose=True)

def helperCreateIncludeDir(self, test_name, hpss_path, zstash_path=ZSTASH_PATH):
    """
    Run `zstash create --include=dir/` and check the combined output.

    The file under `dir/` must be reported as archived; the top-level
    files must not be, and no "ERROR" may appear.
    """
    self.hpss_path = hpss_path
    use_hpss = self.setupDirs(test_name)
    print_starred(
        "Adding files to HPSS" if use_hpss else "Adding files to local archive"
    )
    self.assertWorkspace()

    # A trailing "/" asks for everything below that directory.
    include_patterns = "dir/"
    cmd = (
        f"{zstash_path}zstash create --include={include_patterns} "
        f"--hpss={self.hpss_path} {self.test_dir}"
    )
    output, err = run_cmd(cmd)

    # The transfer message depends on whether HPSS is in use.
    transfer_message = (
        "Transferring file to HPSS" if use_hpss else "put: HPSS is unavailable"
    )
    must_appear = ["Archiving dir/file1.txt", transfer_message]
    must_not_appear = [
        "ERROR",
        "Archiving file0.txt",
        "Archiving file_empty.txt",
        "Archiving file0_soft.txt",
        "Archiving file0_soft_bad.txt",
        "Archiving file0_hard.txt",
    ]
    self.check_strings(cmd, output + err, must_appear, must_not_appear)

def helperCreateIncludeFile(self, test_name, hpss_path, zstash_path=ZSTASH_PATH):
    """
    Run `zstash create --include=file0.txt,file_empty.txt` and check the
    combined output.

    The two named files must be reported as archived; the other files must
    not be, and no "ERROR" may appear.
    """
    self.hpss_path = hpss_path
    use_hpss = self.setupDirs(test_name)
    print_starred(
        "Adding files to HPSS" if use_hpss else "Adding files to local archive"
    )
    self.assertWorkspace()

    # Two explicit file names, comma separated.
    include_patterns = "file0.txt,file_empty.txt"
    cmd = (
        f"{zstash_path}zstash create --include={include_patterns} "
        f"--hpss={self.hpss_path} {self.test_dir}"
    )
    output, err = run_cmd(cmd)

    # The transfer message depends on whether HPSS is in use.
    transfer_message = (
        "Transferring file to HPSS" if use_hpss else "put: HPSS is unavailable"
    )
    must_appear = [
        "Archiving file0.txt",
        "Archiving file_empty.txt",
        transfer_message,
    ]
    must_not_appear = [
        "ERROR",
        "Archiving dir/file1.txt",
        "Archiving file0_soft.txt",
        "Archiving file0_soft_bad.txt",
        "Archiving file0_hard.txt",
    ]
    self.check_strings(cmd, output + err, must_appear, must_not_appear)

def writeExtraFiles(self):
"""
Write extra files for `zstash --exclude`.
Expand Down Expand Up @@ -147,6 +216,20 @@ def testCreateVerboseHPSS(self):
self.conditional_hpss_skip()
self.helperCreateVerbose("testCreateVerboseHPSS", HPSS_ARCHIVE)

def testCreateIncludeDir(self):
    """Exercise `--include` with a directory pattern, passing "none" as the HPSS path."""
    self.helperCreateIncludeDir(test_name="testCreateIncludeDir", hpss_path="none")

def testCreateIncludeDirHPSS(self):
    """Exercise `--include` with a directory pattern, passing HPSS_ARCHIVE as the HPSS path."""
    # NOTE(review): conditional_hpss_skip is defined elsewhere; presumably it
    # skips this test when HPSS cannot be used -- confirm.
    self.conditional_hpss_skip()
    # Pass this test's own name (as testCreateVerboseHPSS does) rather than
    # the name of the non-HPSS variant.
    self.helperCreateIncludeDir("testCreateIncludeDirHPSS", HPSS_ARCHIVE)

def testCreateIncludeFile(self):
    """Exercise `--include` with explicit file names, passing "none" as the HPSS path."""
    self.helperCreateIncludeFile(test_name="testCreateIncludeFile", hpss_path="none")

def testCreateIncludeFileHPSS(self):
    """Exercise `--include` with explicit file names, passing HPSS_ARCHIVE as the HPSS path."""
    # NOTE(review): conditional_hpss_skip is defined elsewhere; presumably it
    # skips this test when HPSS cannot be used -- confirm.
    self.conditional_hpss_skip()
    # Pass this test's own name (as testCreateVerboseHPSS does) rather than
    # the name of the non-HPSS variant.
    self.helperCreateIncludeFile("testCreateIncludeFileHPSS", HPSS_ARCHIVE)

# No need to include a with-HPSS version.
def testCreateExcludeDir(self):
self.helperCreateExcludeDir("testCreateExcludeDir", "none")
Expand Down
5 changes: 4 additions & 1 deletion zstash/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ def setup_create() -> Tuple[str, argparse.Namespace]:
optional: argparse._ArgumentGroup = parser.add_argument_group(
"optional named arguments"
)
optional.add_argument(
"--include", type=str, help="comma separated list of file patterns to include"
)
optional.add_argument(
"--exclude", type=str, help="comma separated list of file patterns to exclude"
)
Expand Down Expand Up @@ -237,7 +240,7 @@ def create_database(cache: str, args: argparse.Namespace) -> List[str]:
cur.execute("insert into config values (?,?)", (attr, value))
con.commit()

files: List[str] = get_files_to_archive(cache, args.exclude)
files: List[str] = get_files_to_archive(cache, args.include, args.exclude)

failures: List[str]
if args.follow_symlinks:
Expand Down
5 changes: 4 additions & 1 deletion zstash/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ def setup_update() -> Tuple[argparse.Namespace, str]:
"and NERSC HPSS endpoints, e.g. globus://nersc/~/my_archive."
),
)
optional.add_argument(
"--include", type=str, help="comma separated list of file patterns to include"
)
optional.add_argument(
"--exclude", type=str, help="comma separated list of file patterns to exclude"
)
Expand Down Expand Up @@ -173,7 +176,7 @@ def update_database( # noqa: C901
logger.debug("Max size : {}".format(maxsize))
logger.debug("Keep local tar files : {}".format(keep))

files: List[str] = get_files_to_archive(cache, args.exclude)
files: List[str] = get_files_to_archive(cache, args.include, args.exclude)

# Eliminate files that are already archived and up to date
newfiles: List[str] = []
Expand Down
45 changes: 30 additions & 15 deletions zstash/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,31 +10,42 @@
from .settings import TupleTarsRow, config, logger


def filter_files(subset: str, files: List[str], include: bool) -> List[str]:
    """
    Keep or drop the entries of `files` that match a list of patterns.

    Parameters:
        subset: comma separated list of `fnmatch` patterns. A pattern ending
            in "/" is taken to mean the whole subdirectory, so "/" is
            replaced with "/*".
        files: file names to filter. The list is not modified.
        include: if true, return only the files matching at least one
            pattern; otherwise return only the files matching none.

    Returns:
        A new list holding the selected entries of `files`, in their
        original order.
    """
    # Pattern matching approach based on
    # https://codereview.stackexchange.com/questions/33624/
    # filtering-a-long-list-of-files-through-a-set-of-ignore-patterns-using-iterators
    #
    # `endswith` (rather than indexing the last character) keeps an empty
    # pattern -- from an empty string or a stray comma -- from raising
    # IndexError. An empty pattern only matches an empty file name.
    subset_patterns: List[str] = [
        pattern + "*" if pattern.endswith("/") else pattern
        for pattern in subset.split(",")
    ]

    keep_matches: bool = bool(include)

    # Single pass: decide per file instead of building a list of matches and
    # then testing membership in it for every file.
    return [
        file_name
        for file_name in files
        if any(fnmatch(file_name, pattern) for pattern in subset_patterns)
        == keep_matches
    ]


def exclude_files(exclude: str, files: List[str]) -> List[str]:
    """
    Return the entries of `files` that match none of the comma separated
    patterns in `exclude`.
    """
    return filter_files(subset=exclude, files=files, include=False)


def include_files(include: str, files: List[str]) -> List[str]:
    """
    Return the entries of `files` that match at least one of the comma
    separated patterns in `include`.
    """
    return filter_files(subset=include, files=files, include=True)


def run_command(command: str, error_str: str):
p1: subprocess.Popen = subprocess.Popen(
shlex.split(command), stdout=subprocess.PIPE, stderr=subprocess.PIPE
Expand All @@ -55,7 +66,7 @@ def run_command(command: str, error_str: str):
raise RuntimeError(error_str)


def get_files_to_archive(cache: str, exclude: str) -> List[str]:
def get_files_to_archive(cache: str, include: str, exclude: str) -> List[str]:
# List of files
logger.info("Gathering list of files to archive")
# Tuples of the form (path, filename)
Expand All @@ -82,7 +93,11 @@ def get_files_to_archive(cache: str, exclude: str) -> List[str]:
if x[0] != os.path.join(".", cache)
]

# Eliminate files based on exclude pattern
# First, add files based on include pattern
if include is not None:
files = include_files(include, files)
Copy link
Collaborator Author

@forsyth2 forsyth2 May 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is sufficient to determine which files to add to the zstash archive after getting the entire list of files, as the archiving step (which comes later) is the computationally expensive part.


# Then, eliminate files based on exclude pattern
if exclude is not None:
files = exclude_files(exclude, files)

Expand Down