-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
147 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
<?php | ||
|
||
namespace mik\fetchermanipulators; | ||
use League\CLImate\CLImate; | ||
use \Monolog\Logger; | ||
|
||
/** | ||
* @file | ||
* Fetcher manipulator that filters out record keys that have | ||
* corresponding payload files in the output directory. In other | ||
* words, this manipulator limits the MIK job to records that | ||
* do not have a file in the output directory. Useful for resuming | ||
* a failed job, etc. Applies to OAI-PMH toolchains. | ||
* | ||
* MIK's --limit parameter applies as if this manipulator were | ||
* absent. If the identifiers listed in the input file match | ||
* records retrieved within the limit, they are included in the | ||
* set processed by MIK; if not, they are excluded from the set | ||
* processed by MIK. Since the specific set is by definition a | ||
* limit on how many records are processed, the --limit parameter | ||
* is not usually used in conjunction with this manipulator. | ||
*/ | ||
|
||
class OaiMissingFileSet extends FetcherManipulator
{
    /**
     * Create a new OaiMissingFileSet fetchermanipulator instance.
     *
     * @param array $settings
     *   All of the settings from the .ini file. The constructor reads
     *   $settings['WRITER']['output_directory'] and
     *   $settings['LOGGING']['path_to_manipulator_log'].
     *
     * @param array $manipulator_settings
     *   This manipulator takes no parameters, this is a placeholder.
     */
    public function __construct($settings, $manipulator_settings)
    {
        $this->settings = $settings;
        $this->outputDirectory = $this->settings['WRITER']['output_directory'];

        // To get the value of $onWindows.
        parent::__construct($settings);

        // Set up logger.
        $this->pathToLog = $this->settings['LOGGING']['path_to_manipulator_log'];
        $this->log = new \Monolog\Logger('OaiMissingFileSet');
        $this->logStreamHandler = new \Monolog\Handler\StreamHandler(
            $this->pathToLog,
            Logger::INFO
        );
        $this->log->pushHandler($this->logStreamHandler);
    }

    /**
     * Selects the subset of records that have no payload file yet.
     *
     * @param array $all_records
     *   All of the records from the fetcher. Each record is expected to
     *   expose its identifier as a ->key property.
     *
     * @return array
     *   The records whose key does not match any file found in the
     *   output directory, in their original order.
     */
    public function manipulate($all_records)
    {
        $numRecs = count($all_records);
        echo "Filtering $numRecs records through the OaiMissingFileSet fetcher manipulator.\n";

        // Instantiate the progress bar if we're not running on Windows.
        if (!$this->onWindows) {
            $climate = new \League\CLImate\CLImate;
            $progress = $climate->progress()->total($numRecs);
        }

        // Flip the key list into a lookup table so each membership test
        // is a constant-time, exact string match rather than a loose
        // linear scan of the whole list for every record.
        $keys_with_files = array_flip($this->getRecordKeysWithFiles());

        $record_num = 0;
        $filtered_records = array();
        foreach ($all_records as $record) {
            if (!isset($keys_with_files[(string) $record->key])) {
                $filtered_records[] = $record;
            }

            $record_num++;
            if ($this->onWindows) {
                print '.';
            } else {
                $progress->current($record_num);
            }
        }
        if ($this->onWindows) {
            print "\n";
        }

        return $filtered_records;
    }

    /**
     * Populates a list of object record keys from the files present
     * in the output directory.
     *
     * @return array
     *   The list of record keys (i.e., OAI-PMH identifiers) that DO
     *   have a corresponding file in the output directory.
     */
    public function getRecordKeysWithFiles()
    {
        $record_keys_with_files = array();
        foreach ($this->getFileList() as $file) {
            // Strip the directory and the extension, then map the
            // remaining name back to the form used for record keys.
            $filename = pathinfo($file, PATHINFO_FILENAME);
            $record_keys_with_files[] = $this->denormalizeFilename($filename);
        }

        return $record_keys_with_files;
    }

    /**
     * Reads the output directory and returns a list of files that do not
     * end in .xml or .log.
     *
     * @return array
     *   An array of file paths (the output directory joined with each
     *   entry name). Empty when nothing matches or the directory cannot
     *   be read.
     */
    public function getFileList()
    {
        $pattern = $this->outputDirectory . DIRECTORY_SEPARATOR . "*";
        $file_list = glob($pattern);
        if ($file_list === false) {
            // glob() reports errors by returning false; treat that the
            // same as an empty directory so callers can always iterate.
            return array();
        }

        $filtered_file_list = array();
        foreach ($file_list as $file_path) {
            if (!preg_match('/\.(xml|log)$/', $file_path)) {
                $filtered_file_list[] = $file_path;
            }
        }

        return $filtered_file_list;
    }

    /**
     * Names of files retrieved in OAI-PMH toolchains are normalized
     * to convert %3A (:) into underscores (_). This function converts
     * them back so that filenames will match OAI-PMH identifiers.
     *
     * Every underscore becomes a colon and the result is then passed
     * through urlencode(), so each underscore in the input ends up
     * as %3A in the output.
     *
     * @param string $string
     *   A filename without directory or extension.
     *
     * @return string
     *   The URL-encoded string to compare against record keys.
     */
    public function denormalizeFilename($string)
    {
        return urlencode(str_replace('_', ':', $string));
    }
}