Skip to content

Commit

Permalink
Merge pull request pulibrary#661 from pulibrary/repo-ocr
Browse files Browse the repository at this point in the history
Adding OCR to the repository
  • Loading branch information
Trey Pendragon committed Jun 7, 2016
2 parents d218a84 + 1b2d8a3 commit db4363f
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 3 deletions.
13 changes: 11 additions & 2 deletions app/services/ocr_runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,22 @@ def from_file(filename)

# Processes the resource's original file and attaches the resulting output
# to the resource.
#
# The original file is passed to Hydra::Derivatives::TempfileService.create,
# which yields an object exposing a local +path+. That path is handed to
# from_file, the output filename is derived from the result, and the file is
# attached via attach_ocr. Once the block finishes, the resource is
# reindexed and saved.
#
# @return the value returned by resource.save
def from_datastream
  Hydra::Derivatives::TempfileService.create(resource.original_file) do |f|
    # Call from_file exactly once and keep its result; an extra bare call
    # would repeat the work only to throw the output away.
    ocr_output = from_file(f.path)
    attach_ocr(ocr_filename(ocr_output))
  end
  resource.update_index
  resource.save
end

private

# Attaches the file at +filename+ to the resource as its :extracted_text file.
#
# @param filename [String] path of the file to attach
# @return the value returned by Hydra::Works::AddFileToFileSet.call
def attach_ocr(filename)
  # Block form closes the handle as soon as the call returns rather than
  # leaving it open until garbage collection.
  # NOTE(review): assumes AddFileToFileSet.call finishes reading the file
  # before it returns — confirm against the hydra-works version in use.
  File.open(filename) do |ocr_file|
    Hydra::Works::AddFileToFileSet.call(resource, ocr_file, :extracted_text)
  end
end

# Derives a filename from the first entry of +ocr_output+.
#
# @param ocr_output [#first] collection whose first entry holds a String
#   under the :url key
# @return [String] that :url value with a leading "file:" prefix removed;
#   returned unchanged when the prefix is absent
def ocr_filename(ocr_output)
  # \A anchors to the very start of the string; ^ would also match right
  # after any embedded newline and strip a "file:" from a later line.
  ocr_output.first[:url].sub(/\Afile:/, '')
end

# Returns the OCRCreator constant.
# NOTE(review): presumably consulted elsewhere in this class to choose which
# creator to use — the caller is not visible in this excerpt; confirm.
def creator_factory
OCRCreator
end
Expand Down
7 changes: 6 additions & 1 deletion spec/models/file_set_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@

expect(path).to exist
end
it "creates full text and indexes it" do
it "creates full text, attaches it to the object, and indexes it" do
allow_any_instance_of(described_class).to receive(:warn) # suppress virus check warnings
allow(Hydra::Derivatives::Jpeg2kImageDerivatives).to receive(:create).and_return(true)
file = File.open(Rails.root.join("spec", "fixtures", "files", "page18.tif"))
Expand All @@ -55,6 +55,11 @@

expect(ocr_path).to exist
expect(subject.to_solr["full_text_tesim"]).to eq "yo"

# verify that ocr has been added to the FileSet
subject.reload
expect(subject.files.size).to eq(2)
expect(subject.files.last.content).to include "<div class='ocr_page'"
end
after do
FileUtils.rm_rf(path.parent) if path.exist?
Expand Down

0 comments on commit db4363f

Please sign in to comment.