From b9f1609df920ee718237a4426d447111a850fa8e Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Wed, 24 Apr 2024 08:09:02 -0400 Subject: [PATCH] Handle WARC filename conflicts with wb-manager add (#902) Append -index to end of filename prior to extension until there is no conflict Also makes sure this behavior is documented in tests --- pywb/manager/manager.py | 36 ++++++++++++++++++++++++++---------- tests/test_manager.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 10 deletions(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 1b833256..4a167cd6 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -7,6 +7,7 @@ import re import gzip import six +import pathlib from distutils.util import strtobool from pkg_resources import resource_string, get_distribution @@ -147,18 +148,32 @@ def add_archives(self, archives, unpack_wacz=False): if invalid_archives: logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}') + def _rename_warc(self, warc_basename): + dupe_idx = 1 + ext = ''.join(pathlib.Path(warc_basename).suffixes) + pre_ext_name = warc_basename.split(ext)[0] + + while True: + new_basename = f'{pre_ext_name}-{dupe_idx}{ext}' + if not os.path.exists(os.path.join(self.archive_dir, new_basename)): + break + dupe_idx += 1 + + return new_basename + def _add_warc(self, warc): - filename = os.path.abspath(warc) + warc_source = os.path.abspath(warc) + source_dir, warc_basename = os.path.split(warc_source) # don't overwrite existing warcs with duplicate names - if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))): - logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.') - return None + if os.path.exists(os.path.join(self.archive_dir, warc_basename)): + warc_basename = self._rename_warc(warc_basename) + logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.') - shutil.copy2(filename, self.archive_dir) - full_path = os.path.join(self.archive_dir, filename) - logging.info('Copied ' + filename + ' to ' + self.archive_dir) - return full_path + warc_dest = os.path.join(self.archive_dir, warc_basename) + shutil.copy2(warc_source, warc_dest) + logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}') + return warc_dest def _add_wacz_unpacked(self, wacz): wacz = os.path.abspath(wacz) @@ -198,8 +213,9 @@ def _add_wacz_unpacked(self, wacz): warc_destination_path = os.path.join(self.archive_dir, warc_filename) if os.path.exists(warc_destination_path): - logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.') - continue + warc_filename = self._rename_warc(warc_filename) + logging.info(f'Warc {warc_destination_path} already exists - renamed to {warc_filename}.') + warc_destination_path = os.path.join(self.archive_dir, warc_filename) warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path) diff --git a/tests/test_manager.py b/tests/test_manager.py index f8245ab0..cd34c997 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -20,6 +20,20 @@ def test_add_valid_wacz_unpacked(self, tmp_path): with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: assert '"filename": "valid_example_1-0.warc"' in f.read() + def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path): + """Test if warc that already exists is renamed with -index suffix""" + manager = self.get_test_collections_manager(tmp_path) + manager._add_wacz_unpacked(VALID_WACZ_PATH) + # Add it again to see if there are name conflicts + manager._add_wacz_unpacked(VALID_WACZ_PATH) + assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir) + assert 'valid_example_1-0-1.warc' in os.listdir(manager.archive_dir) + assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir) + with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: + data = f.read() + assert '"filename": "valid_example_1-0.warc"' in data + assert '"filename": "valid_example_1-0-1.warc"' in data + def test_add_invalid_wacz_unpacked(self, tmp_path, caplog): """Test if adding an invalid wacz file to a collection fails""" manager = self.get_test_collections_manager(tmp_path) @@ -51,6 +65,20 @@ def test_add_valid_archives_unpack_wacz(self, tmp_path): assert archive in os.listdir(manager.archive_dir) assert archive in index_text + def test_add_valid_archives_dupe_name(self, tmp_path): + manager = self.get_test_collections_manager(tmp_path) + warc_filename = 'sample_archive/warcs/example.warc.gz' + manager.add_archives([warc_filename, warc_filename]) + + with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f: + index_text = f.read() + + expected_archives = ('example.warc.gz', 'example-1.warc.gz') + + for archive in expected_archives: + assert archive in os.listdir(manager.archive_dir) + assert archive in index_text + def test_add_valid_archives_dont_unpack_wacz(self, tmp_path): manager = self.get_test_collections_manager(tmp_path) archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',