Skip to content

Commit

Permalink
Compare an accruals location to an aip store
Browse files Browse the repository at this point in the history
This commit introduces an accruals->aips comparison capability.

Digital objects in an accruals folder can now be compared to the
contents of an AIP store.

Where filepaths, checksums, and dates all match, the object is
considered to be identical (a true duplicate). Where they don't,
users can use the modulo (%) comparison output to identify which
components of the object differ.

Much of the benefit of this work is derived from the nature of the
AIP structure imposed on a digital transfer.

Once the comparison is complete, three reports are output in CSV
format:

 * True-duplicates.
 * Near-duplicates (checksums match, but other components might not).
 * Non-duplicates.

Additionally, a summary report is output in JSON.
  • Loading branch information
ross-spencer committed Jul 8, 2019
1 parent a0d4eee commit cab6f33
Show file tree
Hide file tree
Showing 11 changed files with 566 additions and 149 deletions.
161 changes: 161 additions & 0 deletions reports/duplicates/accruals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function, unicode_literals

import copy
import logging
import os
import sys

try:
from .appconfig import AppConfig
from .digital_object import DigitalObject
from . import duplicates
from . import loggingconfig
from .serialize_to_csv import CSVOut
from . import utils
except (ValueError, ImportError):
from appconfig import AppConfig
from digital_object import DigitalObject
import duplicates
import loggingconfig
from serialize_to_csv import CSVOut
import utils

# Directory containing this module; the log file is written alongside it.
logging_dir = os.path.dirname(os.path.abspath(__file__))

logger = logging.getLogger("accruals")
logger.disabled = False

# Location purpose = Transfer Source (TS)
location_purpose = "TS"
# Default transfer-source description, read from config at import time.
default_location = AppConfig().accruals_transfer_source


# Whether transfer paths must be mapped to their Docker-internal
# equivalents. TODO: make this configurable rather than hard-coded.
DOCKER = True

# Store our appraisal paths.
accrual_paths = []


def create_manifest(aip_index, accrual_objs):
    """Compare accrual objects against the AIP store's manifest data.

    :param aip_index: AIP store index; its ``duplicates.MANIFEST_DATA``
        entry maps checksum digests to lists of DigitalObject instances.
    :param accrual_objs: list of DigitalObject instances from one
        accruals transfer.
    :returns: three-tuple ``(dupes, near_matches, non_matches)`` where
        dupes are true duplicates, near_matches are ``[accrual, aip]``
        pairs whose checksums match but other components may not, and
        non_matches are accrual objects not found in the store.
    """
    dupes = []
    near_matches = []
    non_matches = []
    aip_obj_hashes = aip_index.get(duplicates.MANIFEST_DATA)
    for accrual_obj in accrual_objs:
        for accrual_hash in accrual_obj.hashes:
            if accrual_hash in aip_obj_hashes:
                for _, aip_items in aip_obj_hashes.items():
                    for aip_item in aip_items:
                        if accrual_obj == aip_item:
                            # Path, checksum, and date all match: a true
                            # duplicate. Record the AIP it was found in.
                            accrual_obj.flag = True
                            cp = copy.copy(accrual_obj)
                            cp.package_name = aip_item.package_name
                            dupes.append(cp)
                        else:
                            diff = accrual_obj % aip_item
                            if (
                                diff == "No matching components"
                                or "checksum match" not in diff
                            ):
                                # Nothing meaningful in common; don't output.
                                continue
                            accrual_obj.flag = True
                            cp1 = copy.copy(accrual_obj)
                            cp2 = copy.copy(aip_item)
                            near_matches.append([cp1, cp2])
                # Only need one hash to match then break.
                # May also be redundant as we only have one hash from the
                # bag manifests...
                break
    for accrual_obj in accrual_objs:
        if accrual_obj.flag is False:
            cp = copy.copy(accrual_obj)
            # Suppress repeated identical objects in the non-match report.
            if cp not in non_matches:
                non_matches.append(cp)
    return dupes, near_matches, non_matches


def create_comparison_obj(transfer_path):
    """Walk a transfer directory and wrap each regular file for comparison.

    :param transfer_path: path to the root of a transfer directory.
    :returns: list of DigitalObject instances, one per file found.
    """
    transfer_arr = []
    for root, _, files in os.walk(transfer_path, topdown=True):
        for name in files:
            file_ = os.path.join(root, name)
            # os.walk only lists files, but guard against special entries
            # (e.g. broken symlinks) all the same.
            if os.path.isfile(file_):
                transfer_arr.append(DigitalObject(file_, transfer_path))
    return transfer_arr


def stat_transfers(accruals_path, all_transfers):
    """Compare every transfer below the accruals path to the AIP store
    and serialize the duplicate, near-duplicate, and non-duplicate
    results as reports.
    """
    aip_index = duplicates.retrieve_aip_index()
    dupe_reports = []
    near_reports = []
    no_match_reports = []
    transfers = []
    for transfer in all_transfers:
        transfer_home = os.path.join(accruals_path, transfer)
        if DOCKER:
            # Map the host path to its Docker-internal equivalent.
            transfer_home = utils.get_docker_path(transfer_home)
        objs = create_comparison_obj(transfer_home)
        transfers.append(objs)
        matches, nears, no_matches = create_manifest(aip_index, objs)
        # Only transfers that produced results appear in each report.
        if matches:
            dupe_reports.append({transfer: matches})
        if nears:
            near_reports.append({transfer: nears})
        if no_matches:
            no_match_reports.append({transfer: no_matches})
    CSVOut.stat_manifests(aip_index, transfers)
    if dupe_reports:
        CSVOut.dupe_csv_out(dupe_reports, "")
    if near_reports:
        CSVOut.near_csv_out(near_reports, "")
    if no_match_reports:
        CSVOut.no_match_csv_out(no_match_reports, "")


def main(location=default_location):
    """Primary entry point for this script.

    Find the transfer-source (TS) storage location whose description
    matches ``location``, configure the Archivematica client with it,
    and compare every transfer directory beneath it to the AIP store.
    Logs and exits if no matching source exists.
    """
    am = AppConfig().get_am_client()
    sources = am.list_storage_locations()

    accruals = False
    # Default to [] so a response without "objects" means "no sources"
    # rather than a TypeError.
    for source in sources.get("objects", []):
        if (
            source.get("purpose") == location_purpose
            and source.get("description") == location
        ):
            # Configure the client for this source. If several locations
            # match, the last one wins (original behavior preserved).
            am.transfer_source = source.get("uuid")
            am.transfer_path = source.get("path")
            accruals = True
    if not accruals:
        logger.info("Exiting. No transfer source: %s", location)
        sys.exit()

    # All transfer directories. Assumption is the same as Archivematica that
    # each transfer is organized into a single directory at this level.
    all_transfers = am.transferables().get("directories")
    stat_transfers(am.transfer_path, all_transfers)


if __name__ == "__main__":
    loggingconfig.setup("INFO", os.path.join(logging_dir, "report.log"))
    source = default_location
    try:
        # An optional positional argument overrides the configured
        # transfer-source location.
        source = sys.argv[1]
        # Informational message, so log at INFO rather than ERROR.
        logger.info("Attempting to find transfers at: %s", source)
    except IndexError:
        pass
    sys.exit(main(source))
1 change: 1 addition & 0 deletions reports/duplicates/appconfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def _load_config(self, config_file):
self.storage_service_user = conf.get("storage_service_user")
self.storage_service_api_key = conf.get("storage_service_api_key")
self.storage_service_url = conf.get("storage_service_url")
self.accruals_transfer_source = conf.get("accruals_transfer_source")

def get_am_client(self):
"""Return an Archivematica API client to the caller."""
Expand Down
3 changes: 2 additions & 1 deletion reports/duplicates/config.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
{
"storage_service_url": "http://127.0.0.1:62081",
"storage_service_user": "test",
"storage_service_api_key": "test"
"storage_service_api_key": "test",
"accruals_transfer_source": "accruals"
}
129 changes: 129 additions & 0 deletions reports/duplicates/digital_object.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Digital object class to help with matching."""

import json
import os
import time

try:
from . import hashutils
except (ValueError, ImportError):
import hashutils


class DigitalObjectException(Exception):
    """Raised when a DigitalObject cannot be constructed, e.g. when a
    path is supplied without its accompanying transfer path."""


class DigitalObject(object):
    """A single file's metadata, with equality and modulo operators to
    support duplicate matching against an AIP store."""

    # Class-level defaults; instances assign their own values in __init__.
    basename = None
    date_modified = None
    dirname = None
    filepath = None
    hashes = None
    package_uuid = None
    package_name = None

    def __init__(self, path=None, transfer_path=None):
        """Populate the digital object metadata. If we don't supply a path
        we'll just return an empty object to be populated on our own terms.

        :param path: filesystem path to the object (optional).
        :param transfer_path: transfer root the object belongs to;
            required whenever path is supplied.
        :raises DigitalObjectException: if path is given without
            transfer_path.
        """
        if not path:
            self.basename = None
            self.date_modified = None
            self.dirname = None
            # NOTE(review): initialized as a list here, but the path
            # branch (and __eq__/__mod__) treat hashes as a dict of
            # digest -> algorithm; callers populating an empty object
            # must assign a dict — confirm and unify.
            self.hashes = []
            self.package_uuid = None
            self.package_name = None
            self.flag = False

        if path:
            if not transfer_path:
                raise DigitalObjectException("Transfer path isn't set")
            # Construct path as if it is in a Bag object.
            comparison_path = path.replace(
                transfer_path, os.path.join("data", "objects")
            )
            self.filepath = comparison_path
            self.set_basename(comparison_path)
            self.set_dirname(comparison_path)
            self.hashes = hashutils.hash(path)
            self.date_modified = self.get_timestamp(path)
            self.flag = False

    def set_basename(self, path):
        """Set basename to the final path component of path."""
        self.basename = os.path.basename(path)

    def set_dirname(self, path):
        """Set dirname to the directory component of path."""
        self.dirname = os.path.dirname(path)

    def as_dict(self):
        """Return the object's attributes as a dictionary.

        NOTE(review): this exposes the live ``__dict__``; mutations by
        the caller affect the object itself.
        """
        return self.__dict__

    def __str__(self):
        """Return a pretty-printed JSON serialization of the object."""
        return json.dumps(
            self.__dict__, sort_keys=True, indent=4, separators=(",", ": ")
        )

    def __eq__(self, other):
        """Comparison operator for the digital object class. If two hashes
        match, and the given file path, we will return True.
        """
        ret = False
        for key in self.hashes:
            if key in other.hashes:
                # A single shared digest marks the content as equal.
                ret = True
                break
        if self.filepath != other.filepath:
            ret = False
        if self.date_modified != other.date_modified:
            ret = False
        return ret

    def __mod__(self, other):
        """Modulo operator for the digital object class. If two hashes match,
        and the given file-path, then return zero. If there is any partial
        match, then return basis information. % is potentially useful for
        debugging, or enhanced reporting.
        """
        if self.__eq__(other):
            return 0
        # ret is False, repurpose to return basis information.
        ret = ""
        for key in self.hashes:
            if key in other.hashes:
                msg = "checksum match"
                ret = self.__concat_basis__(ret, msg)
                break
        if self.date_modified == other.date_modified:
            msg = "date modified match"
            ret = self.__concat_basis__(ret, msg)
        if self.basename == other.basename:
            msg = "filename match"
            ret = self.__concat_basis__(ret, msg)
        if self.dirname == other.dirname:
            msg = "directory name match"
            ret = self.__concat_basis__(ret, msg)
        if not ret:
            return "No matching components"
        return ret

    @staticmethod
    def __concat_basis__(ret, msg):
        """Helper function to bring basis information together usefully."""
        if ret:
            return "{}; {}".format(ret, msg)
        return msg

    @staticmethod
    def get_timestamp(path):
        """Return the file's last-modified date as a YYYY-MM-DD string."""
        return time.strftime("%Y-%m-%d", time.localtime(os.path.getmtime(path)))
Loading

0 comments on commit cab6f33

Please sign in to comment.