From d2905b47ad72d675d9e0da511214b573e3142657 Mon Sep 17 00:00:00 2001 From: Kuechensofa Date: Mon, 16 Jan 2023 23:51:05 +0100 Subject: [PATCH 1/2] [#795] Show error when adding duplicate warc file Show an error when adding a warc file that has the same filename as an existing file in the collection and do not overwrite the existing file. --- pywb/manager/manager.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 3d56d53e2..824fbfa69 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -123,6 +123,10 @@ def add_warcs(self, warcs): full_paths = [] for filename in warcs: filename = os.path.abspath(filename) + + if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))): + raise IOError(f'Warc {filename} already exists') + shutil.copy2(filename, self.archive_dir) full_paths.append(os.path.join(self.archive_dir, filename)) logging.info('Copied ' + filename + ' to ' + self.archive_dir) From cd7031a94839232c94903f2398e2a5461b2da82a Mon Sep 17 00:00:00 2001 From: Kuechensofa Date: Thu, 19 Jan 2023 14:35:43 +0100 Subject: [PATCH 2/2] Add non-duplicate warcs when skipping duplicate warcs Skip duplicate warcs and print a warning, but still add and index the warcs that don't have duplicates. 
--- pywb/manager/manager.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pywb/manager/manager.py b/pywb/manager/manager.py index 824fbfa69..18c76ee77 100644 --- a/pywb/manager/manager.py +++ b/pywb/manager/manager.py @@ -121,11 +121,14 @@ def add_warcs(self, warcs): format(self.archive_dir)) full_paths = [] + duplicate_warcs = [] for filename in warcs: filename = os.path.abspath(filename) + # don't overwrite existing warcs with duplicate names if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))): - raise IOError(f'Warc {filename} already exists') + duplicate_warcs.append(filename) + continue shutil.copy2(filename, self.archive_dir) full_paths.append(os.path.join(self.archive_dir, filename)) @@ -133,6 +136,9 @@ def add_warcs(self, warcs): self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE) + if duplicate_warcs: + logging.warning(f'Warcs {", ".join(duplicate_warcs)} weren\'t added because of duplicate names.') + def reindex(self): cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE) logging.info('Indexing ' + self.archive_dir + ' to ' + cdx_file)