Skip to content

Commit

Permalink
Merge pull request #76 from psu-libraries/#23-mapping-subjects
Browse files Browse the repository at this point in the history
mapping subjects  #23
  • Loading branch information
cdmo authored Jan 15, 2019
2 parents 5b8ad28 + 93b76ed commit f362c32
Show file tree
Hide file tree
Showing 5 changed files with 140 additions and 24 deletions.
17 changes: 13 additions & 4 deletions .rubocop_todo.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# This configuration was generated by
# `rubocop --auto-gen-config`
# on 2018-11-14 15:12:24 -0500 using RuboCop version 0.60.0.
# on 2019-01-11 16:17:21 -0500 using RuboCop version 0.60.0.
# The point is for the user to remove these configuration records
# one by one as the offenses are removed from the code base.
# Note that changes in the inspected code, or installation of new
Expand All @@ -11,19 +11,28 @@ Lint/EmptyWhen:
Exclude:
- 'lib/traject/psulib_config.rb'

# Offense count: 1
# Offense count: 2
Metrics/AbcSize:
Max: 26

# Offense count: 5
# Configuration parameters: CountComments, ExcludedMethods.
# ExcludedMethods: refine
Metrics/BlockLength:
Max: 65

# Offense count: 2
# Configuration parameters: CountComments, ExcludedMethods.
Metrics/MethodLength:
Max: 16

# Offense count: 3
Style/MixinUsage:
Exclude:
- 'lib/traject/psulib_config.rb'

# Offense count: 36
# Offense count: 77
# Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns.
# URISchemes: http, https
Metrics/LineLength:
Max: 177
Max: 231
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,5 @@ For testing purposes you can run traject with the `--debug-mode` flag to
display the output to the console (and not push the data to Solr).
```
$ bundle exec traject --debug-mode -c lib/traject/psulib_config.rb /full/path/to/marcfile.mrc
$ bundle exec traject --debug-mode -c lib/traject/psulib_config.rb solr/sample_data/sample_psucat.mrc
```
45 changes: 26 additions & 19 deletions lib/traject/psulib_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
require 'traject/marc4j_reader' if is_jruby
require 'traject/macros/marc21_semantics'
require 'traject/macros/marc_format_classifier'
require_relative './psulib_marc'

extend Traject::Macros::Marc21
extend Traject::Macros::Marc21Semantics
Expand Down Expand Up @@ -160,28 +161,34 @@
## Author sorting field
to_field 'author_ssort', marc_sortable_author

# Subject fields
# Subject field(s):
## Primary subject
to_field 'subject_tsim', extract_marc(%W[
600#{ATOU}
610#{ATOU}
611#{ATOU}
630#{ATOU}
650abcde
651ae
653a:654abcde:655abc
].join(':'))
to_field 'subject_tsim', extract_marc('600abcdfklmnopqrtvxyz:610abfklmnoprstvxyz:611abcdefgklnpqstvxyz:630adfgklmnoprstvxyz:647acdg:648a:650abcd:651a:653a:654ab')

## Additional subject fields
to_field 'subject_addl_tsim', extract_marc('600vwxyz:610vwxyz:611vwxyz:630vwxyz:650vwxyz:651vwxyz:654vwxyz:655vwxyz')
to_field 'subject_topic_facet_ssim', extract_marc('600|*0|abcdq:610|*0|ab:611|*0|ab:630|*0|ab:650|*0|a:653|*0|a', trim_punctuation: true) do |record, accumulator, _context|
# Include Fast Headings
MarcExtractor.new('650|*7|2').collect_matching_lines(record) do |field, _spec, _extractor|
if field['2'].to_s.downcase.include? 'fast'
fast_subject = Marc21.trim_punctuation field['a']
accumulator << fast_subject unless fast_subject.nil?
end
end
to_field 'subject_addl_tsim', extract_marc('600vxyz:610vxyz:611vxyz:630vxyz:647vxyz:648vxyz:650vxyz:651vxyz:654vyz')

## Subject display
hierarchy_fields = '650|*0|abcdvxyz:650|*2|abcdvxyz:650|*1|abcdvxyz:650|*3|abcdvxyz:650|*6|abcdvxyz:650|*7|abcdvxyz:600abcdfklmnopqrtvxyz:610abfklmnoprstvxyz:611abcdefgklnpqstvxyz:630adfgklmnoprstvxyz:647acdgvxyz:648avxyz:651avxyz'
to_field 'subject_display_ssm' do |record, accumulator|
  # Swap the accumulator contents for the hierarchical headings, dropping nils and duplicates.
  accumulator.replace(process_hierarchy(record, hierarchy_fields).compact.uniq)
end

# For hierarchical subject display
to_field 'subject_facet' do |record, accumulator|
  # Same hierarchical headings as the display field, de-duplicated and with nils removed.
  hierarchical = process_hierarchy(record, hierarchy_fields)
  accumulator.replace(hierarchical.compact.uniq)
end

# Subject facet (sidebar)
to_field 'subject_topic_facet_ssim' do |record, accumulator|
  # Each heading is broken into its individual parts; nils and repeated parts are discarded.
  facet_spec = '650|*0|aa:650|*0|x:650|*1|aa:650|*1|x:651|*0|a:651|*0|x:600abcdtq:610abt:610x:611abt:611x'
  accumulator.replace(process_subject_topic_facet(record, facet_spec).compact.uniq)
end
Expand Down
42 changes: 42 additions & 0 deletions lib/traject/psulib_marc.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Em dash inserted in front of v/x/y/z subdivisions; the helpers below also split on it again
# so each part of a heading can have its punctuation trimmed separately.
SEPARATOR = '—'.freeze

# Builds hierarchical subject/genre headings for display.
#
# For every field matched by +fields+, the extracted subfields are combined into one
# string and an em dash (SEPARATOR) is placed in front of each v, x, y and z subdivision.
# Punctuation is trimmed from every part before the parts are joined back together.
#
# @param record the MARC record to read
# @param fields [String] a Traject MarcExtractor specification
# @param vocabulary [Array<String>] optional whitelist of subfield $2 values; when it is
#   non-empty only headings whose $2 value is in the list are returned
# @return [Array<String>] one heading per matching field, parts joined by SEPARATOR
def process_hierarchy(record, fields, vocabulary = [])
  subjects = []
  split_on_subfield = %w[v x y z]
  Traject::MarcExtractor.cached(fields).collect_matching_lines(record) do |field, spec, extractor|
    subject = extractor.collect_subfields(field, spec).first
    next if subject.nil?

    # always include the subject if a vocabulary is not specified
    include_subject = vocabulary.empty?
    field.subfields.each do |s_field|
      # when specified, only include subject if it is part of the vocabulary
      include_subject = vocabulary.include?(s_field.value) if s_field.code == '2' && !vocabulary.empty?
      next unless split_on_subfield.include?(s_field.code)
      # An empty subdivision would make the pattern below a bare space and turn
      # every space of the heading into a separator.
      next if s_field.value.to_s.empty?

      # NOTE(review): this replaces every occurrence of the value, so identical text
      # inside an earlier subfield is split as well — confirm that is acceptable.
      subject = subject.gsub(" #{s_field.value}", "#{SEPARATOR}#{s_field.value}")
    end
    parts = subject.split(SEPARATOR).map { |s| Traject::Macros::Marc21.trim_punctuation(s) }
    subjects << parts.join(SEPARATOR) if include_subject
  end
  subjects
end

# Builds the flat list of values for the split subject facet.
#
# For every field matched by +fields+, the extracted subfields are combined into one
# string, an em dash (SEPARATOR) is placed in front of each v, x, y and z subdivision,
# and the string is then split on that separator so every part becomes its own value.
# Punctuation is trimmed from each part.
#
# @param record the MARC record to read
# @param fields [String] a Traject MarcExtractor specification
# @return [Array<String>] all heading parts of all matching fields, in field order
def process_subject_topic_facet(record, fields)
  subjects = []
  split_on_subfield = %w[v x y z]
  Traject::MarcExtractor.cached(fields).collect_matching_lines(record) do |field, spec, extractor|
    subject = extractor.collect_subfields(field, spec).first
    next if subject.nil?

    field.subfields.each do |s_field|
      next unless split_on_subfield.include?(s_field.code)
      # An empty subdivision would make the pattern below a bare space and break
      # the heading into single words.
      next if s_field.value.to_s.empty?

      subject = subject.gsub(" #{s_field.value}", "#{SEPARATOR}#{s_field.value}")
    end
    subjects.concat(subject.split(SEPARATOR).map { |s| Traject::Macros::Marc21.trim_punctuation(s) })
  end
  subjects
end
58 changes: 58 additions & 0 deletions spec/lib/traject/psulib_marc_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# frozen_string_literal: true

require_relative '../../../lib/traject/psulib_marc'

# Specs for the subject helpers defined in lib/traject/psulib_marc.rb.
RSpec.describe 'From psulib_marc.rb' do
  describe 'process_hierarchy function' do
    before(:all) do
      # Both "excluded" fixtures carry a 2nd indicator other than 0, so the '|*0|' specs below must skip them.
      @s600_excluded = { '600' => { 'ind1' => '', 'ind2' => '5', 'subfields' => [{ 'a' => 'Exclude' }] } }
      @s630_excluded = { '630' => { 'ind1' => '', 'ind2' => '7', 'subfields' => [{ 'a' => 'Also Exclude' }] } }
      @s600 = { '600' => { 'ind1' => '', 'ind2' => '0', 'subfields' => [{ 'a' => 'John.' }, { 't' => 'Title.' }, { 'v' => 'split genre' }, { 'd' => '2015' }, { '2' => 'special' }] } }
      @s630 = { '630' => { 'ind1' => '', 'ind2' => '0', 'subfields' => [{ 'x' => 'Fiction' }, { 'y' => '1492' }, { 'z' => "don't ignore" }, { 't' => 'TITLE.' }] } }
      @sample_marc = MARC::Record.new_from_hash('fields' => [@s600_excluded, @s630_excluded, @s600, @s630])
      @subjects = process_hierarchy(@sample_marc, '600|*0|abcdfklmnopqrtvxyz:630|*0|adfgklmnoprstvxyz')
      @vocab_subjects = process_hierarchy(@sample_marc, '600|*0|abcdfklmnopqrtvxyz:630|*0|adfgklmnoprstvxyz', ['vocab'])
      @special_subjects = process_hierarchy(@sample_marc, '600|*0|abcdfklmnopqrtvxyz:630|*0|adfgklmnoprstvxyz', ['special'])
    end

    describe 'when an optional vocabulary limit is not provided' do
      it 'excludes subjects without 0 in the 2nd indicator' do
        expect(@subjects).not_to include('Exclude')
        expect(@subjects).not_to include('Also Exclude')
      end

      it 'only separates v,x,y,z with em dash, strips punctuation' do
        expect(@subjects).to include("John. Title#{SEPARATOR}split genre 2015")
        expect(@subjects).to include("Fiction#{SEPARATOR}1492#{SEPARATOR}don't ignore TITLE")
      end
    end

    describe 'when a vocabulary limit is provided' do
      it 'excludes headings missing a subfield 2 or part of a different vocab' do
        expect(@vocab_subjects).to eq []
      end

      it 'only includes the heading from a matching subfield 2 value' do
        expect(@special_subjects).to eq ["John. Title#{SEPARATOR}split genre 2015"]
      end
    end
  end

  describe 'process_subject_topic_facet function' do
    before(:all) do
      @s600 = { '600' => { 'ind1' => '', 'ind2' => '0', 'subfields' => [{ 'a' => 'John.' }, { 'x' => 'Join' }, { 't' => 'Title' }, { 'd' => '2015' }] } }
      @s630 = { '630' => { 'ind1' => '', 'ind2' => '0', 'subfields' => [{ 'x' => 'Fiction' }, { 'y' => '1492' }, { 'z' => "don't ignore" }, { 'v' => 'split genre' }] } }
      @sample_marc = MARC::Record.new_from_hash('fields' => [@s600, @s630])
      @subjects = process_subject_topic_facet(@sample_marc, '600|*0|abcdfklmnopqrtvxyz:630|*0|adfgklmnoprstvxyz')
    end

    it 'trims punctuation' do
      expect(@subjects).to include('John')
    end

    it 'includes subjects split along v, x, y and z' do
      expect(@subjects).to include('Join Title 2015')
      expect(@subjects).to include('1492')
      expect(@subjects).to include('split genre')
      expect(@subjects).to include('Fiction')
      expect(@subjects).to include("don't ignore")
    end
  end
end

0 comments on commit f362c32

Please sign in to comment.