Skip to content

Commit

Permalink
Add URL support to D.O. load task (#1806)
Browse files Browse the repository at this point in the history
Added functionality to the digital object load task to import URLs in
addition to files.

Did minor cleanup and improved CLI help to include a list of valid CSV
columns.
  • Loading branch information
Mike Cantelon committed May 2, 2024
1 parent 11ee01f commit b6a7596
Showing 1 changed file with 92 additions and 33 deletions.
125 changes: 92 additions & 33 deletions lib/task/digitalobject/digitalObjectLoadTask.class.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,19 @@
*/

/**
* Load a csv list of digital objects.
* Load a CSV list of digital objects.
*
* @author David Juhasz <[email protected]>
*/
class digitalObjectLoadTask extends arBaseTask
{
public const IO_SLUG_COLUMN = 'slug';
public const IO_IDENTIFIER_COLUMN = 'identifier';
public const IO_ID_COLUMN = 'information_object_id';
public const PATH_COLUMN = 'filename';

public const IO_SPECIFIER_COLUMNS = [self::IO_SLUG_COLUMN, self::IO_IDENTIFIER_COLUMN, self::IO_ID_COLUMN];

protected static $count = 0;

private $curObjNum = 0;
Expand Down Expand Up @@ -72,22 +79,20 @@ public function execute($arguments = [], $options = [])
// Get header (first) row
$header = fgetcsv($fh, 1000);

if ((!in_array('information_object_id', $header) && !in_array('identifier', $header) && !in_array('slug', $header)) || !in_array('filename', $header)) {
throw new sfException('Import file must contain an \'information_object_id\', an \'identifier\' or a \'slug\' column, and a \'filename\' column');
}
self::validateColumns($header);

$fileKey = array_search('filename', $header);
$fileKey = array_search(self::PATH_COLUMN, $header);

// If information_object_id column is available, use it for id
if (false !== $idKey = array_search('information_object_id', $header)) {
if (false !== $idKey = array_search(self::IO_ID_COLUMN, $header)) {
$idType = 'id';
}
// If no id, then lookup by identifier
elseif (false !== $idKey = array_search('identifier', $header)) {
elseif (false !== $idKey = array_search(self::IO_IDENTIFIER_COLUMN, $header)) {
$idType = 'identifier';
}
// Lookup by slug
elseif (false !== $idKey = array_search('slug', $header)) {
elseif (false !== $idKey = array_search(self::IO_SLUG_COLUMN, $header)) {
$idType = 'slug';
}

Expand Down Expand Up @@ -159,14 +164,14 @@ public function execute($arguments = [], $options = [])
$digitalObjectName = !is_array($item) ? $item : end($item);

if (null !== $results[1]) {
if (file_exists($path = self::getPath($digitalObjectName, $options))) {
if (self::validUrlOrFilePath($digitalObjectName, $options)) {
// get digital object and delete it.
if (null !== $do = QubitDigitalObject::getById($results[1])) {
$do->delete();
++$this->deletedCount;
}
} else {
$this->log(sprintf("Couldn't read file '{$digitalObjectName}'"));
$this->log(sprintf("Couldn't read file or URL '{$digitalObjectName}'"));
++$this->skippedCount;

continue;
Expand All @@ -186,8 +191,8 @@ public function execute($arguments = [], $options = [])
continue;
}

if (!file_exists($path = self::getPath($item, $options))) {
$this->log(sprintf("Couldn't read file '{$item}'"));
if (!self::validUrlOrFilePath($item, $options)) {
$this->log(sprintf("Couldn't read file of URL '{$item}'"));
++$this->skippedCount;

continue;
Expand All @@ -196,8 +201,8 @@ public function execute($arguments = [], $options = [])
self::addDigitalObject($results[0], $item, $options);
} else {
if (!is_array($item)) {
if (!file_exists($path = self::getPath($item, $options))) {
$this->log(sprintf("Couldn't read file '{$item}'"));
if (!self::validUrlOrFilePath($item, $options)) {
$this->log(sprintf("Couldn't read file of URL '{$item}'"));
++$this->skippedCount;

continue;
Expand All @@ -207,8 +212,8 @@ public function execute($arguments = [], $options = [])
} else {
// If more than one digital object linked to this information object
for ($i = 0; $i < count($item); ++$i) {
if (!file_exists($path = self::getPath($item[$i], $options))) {
$this->log(sprintf("Couldn't read file '{$item[$i]}'"));
if (!self::validUrlOrFilePath($item[$i], $options)) {
$this->log(sprintf("Couldn't read file of URL '{$item[$i]}'"));
++$this->skippedCount;

continue;
Expand Down Expand Up @@ -244,8 +249,8 @@ protected function configure()
new sfCommandOption('application', null, sfCommandOption::PARAMETER_OPTIONAL, 'The application name', true),
new sfCommandOption('env', null, sfCommandOption::PARAMETER_REQUIRED, 'The environment', 'cli'),
new sfCommandOption('connection', null, sfCommandOption::PARAMETER_REQUIRED, 'The connection name', 'propel'),
new sfCommandOption('link-source', 's', sfCommandOption::PARAMETER_NONE, 'Link source', null),
new sfCommandOption('path', 'p', sfCommandOption::PARAMETER_OPTIONAL, 'Path prefix for digital objects', null),
new sfCommandOption('link-source', 's', sfCommandOption::PARAMETER_NONE, 'Link source (if importing a file)', null),
new sfCommandOption('path', 'p', sfCommandOption::PARAMETER_OPTIONAL, 'Path or URL prefix for all digital objects', null),
new sfCommandOption('limit', 'l', sfCommandOption::PARAMETER_OPTIONAL, 'Limit number of digital objects imported to n', null),
new sfCommandOption('attach-only', 'a', sfCommandOption::PARAMETER_NONE, 'Always attach digital objects to a new child description', null),
new sfCommandOption('replace', 'r', sfCommandOption::PARAMETER_NONE, 'Delete and replace digital objects', null),
Expand All @@ -255,11 +260,11 @@ protected function configure()

$this->namespace = 'digitalobject';
$this->name = 'load';
$this->briefDescription = 'Load a csv list of digital objects';
$this->briefDescription = 'Load a CSV list of digital objects';

$this->detailedDescription = <<<'EOF'
Load a csv list of digital objects
EOF;
$this->detailedDescription = "Load a CSV list of digital objects\n\n";

$this->detailedDescription .= "Valid CSV columns are '".self::PATH_COLUMN."' and one of: '".implode("', '", self::IO_SPECIFIER_COLUMNS)."'";
}

protected function attachDigitalObject($item, $informationObjectId, $options = [])
Expand All @@ -275,6 +280,28 @@ protected function attachDigitalObject($item, $informationObjectId, $options = [
self::addDigitalObject($informationObject->id, $item, $options);
}

/**
 * Validate that the CSV header row contains the required columns.
 *
 * The header must contain the path column (self::PATH_COLUMN) and at least
 * one of the information object specifier columns
 * (self::IO_SPECIFIER_COLUMNS).
 *
 * @param mixed $columns Header row as returned by fgetcsv(); any non-array
 *                       value (e.g. false when no row could be read) is
 *                       treated as a header with no columns
 *
 * @throws sfException if the path column or every specifier column is missing
 */
protected function validateColumns($columns)
{
    // fgetcsv() returns false when no row could be read (e.g. empty file).
    // Treat that as an empty header so the descriptive exception below is
    // thrown instead of a TypeError from in_array() on PHP 8.
    if (!is_array($columns)) {
        $columns = [];
    }

    // First check for existence of column indicating file path or URL
    $hasPathColumn = in_array(self::PATH_COLUMN, $columns, true);

    // Second check for existence of an information object specifier column
    $hasSpecifierColumn = [] !== array_intersect(self::IO_SPECIFIER_COLUMNS, $columns);

    // Throw error if columns aren't valid
    if (!$hasPathColumn || !$hasSpecifierColumn) {
        throw new sfException("Import file must contain a '".self::PATH_COLUMN."' column and one of the following: '".implode("', '", self::IO_SPECIFIER_COLUMNS)."'");
    }
}

protected function getPath($path, $options = [])
{
if (isset($options['path'])) {
Expand All @@ -284,20 +311,44 @@ protected function getPath($path, $options = [])
return $path;
}

protected function addDigitalObject($objectId, $path, $options = [])
protected function validUrlOrFilePath($url_or_path, $options)
{
++$this->curObjNum;
$url_or_path = self::getPath($url_or_path, $options);

$path = self::getPath($path, $options);
// Check first for a file (as this is fastest and most likely)
if (file_exists($url_or_path)) {
return true;
}

$filename = basename($path);
// If it's not a file, assume it's a URL and dismiss if invalid
if (!filter_var($url_or_path, FILTER_VALIDATE_URL)) {
return false;
}

// Check if URL exists
$headers = @get_headers($url_or_path);

if ($headers && strpos($headers[0], '200')) {
return true;
}

// Not a file path or valid, existing URL
return false;
}

protected function addDigitalObject($objectId, $path, $options = [])
{
++$this->curObjNum;

if (!file_exists($path)) {
$this->log("Couldn't read file '{$path}'");
if (!self::validUrlOrFilePath($path, $options)) {
$this->log("Couldn't read file or URL '{$path}'");

return;
}

$path = self::getPath($path, $options);
$filename = basename($path);

$remainingImportCount = $this->totalObjCount - $this->skippedCount - $importedCount;
$operation = $options['replace'] ? 'Replacing with' : 'Loading';
$message = sprintf("%s '%s' (%d of %d remaining", $operation, $filename, $this->curObjNum, $remainingImportCount);
Expand All @@ -313,13 +364,21 @@ protected function addDigitalObject($objectId, $path, $options = [])
$do = new QubitDigitalObject();
$do->objectId = $objectId;

if ($options['link-source']) {
if (false === $do->importFromFile($path)) {
return;
if (file_exists($path)) {
// Add digital object from file
if ($options['link-source']) {
if (false === $do->importFromFile($path)) {
return;
}
} else {
$do->usageId = QubitTerm::MASTER_ID;
$do->assets[] = new QubitAsset($path);
}
} else {
$do->usageId = QubitTerm::MASTER_ID;
$do->assets[] = new QubitAsset($path);
// Add digital object from URL
if (false === $do->importFromURI($path)) {
return;
}
}

$do->save($options['conn']);
Expand Down

0 comments on commit b6a7596

Please sign in to comment.