Merge pull request #76 from psu-libraries/#23-mapping-subjects

mapping subjects #23
psu-libraries · Jan 15, 2019 · f362c32 · f362c32
2 parents 5b8ad28 + 93b76ed
commit f362c32
Show file tree

Hide file tree

Showing 5 changed files with 140 additions and 24 deletions.
diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml
@@ -1,6 +1,6 @@
 # This configuration was generated by
 # `rubocop --auto-gen-config`
-# on 2018-11-14 15:12:24 -0500 using RuboCop version 0.60.0.
+# on 2019-01-11 16:17:21 -0500 using RuboCop version 0.60.0.
 # The point is for the user to remove these configuration records
 # one by one as the offenses are removed from the code base.
 # Note that changes in the inspected code, or installation of new
@@ -11,19 +11,28 @@ Lint/EmptyWhen:
   Exclude:
     - 'lib/traject/psulib_config.rb'
 
-# Offense count: 1
+# Offense count: 2
+Metrics/AbcSize:
+  Max: 26
+
+# Offense count: 5
 # Configuration parameters: CountComments, ExcludedMethods.
 # ExcludedMethods: refine
 Metrics/BlockLength:
   Max: 65
 
 # Offense count: 2
+# Configuration parameters: CountComments, ExcludedMethods.
+Metrics/MethodLength:
+  Max: 16
+
+# Offense count: 3
 Style/MixinUsage:
   Exclude:
     - 'lib/traject/psulib_config.rb'
 
-# Offense count: 36
+# Offense count: 77
 # Configuration parameters: AllowHeredoc, AllowURI, URISchemes, IgnoreCopDirectives, IgnoredPatterns.
 # URISchemes: http, https
 Metrics/LineLength:
-  Max: 177
+  Max: 231
diff --git a/README.md b/README.md
@@ -57,5 +57,5 @@ For testing purposes you can run traject with the `--debug-mode` flag to
 display the output to the console (and not push the data to Solr).
 
 ```
-$ bundle exec traject --debug-mode -c lib/traject/psulib_config.rb /full/path/to/marcfile.mrc
+$ bundle exec traject --debug-mode -c lib/traject/psulib_config.rb solr/sample_data/sample_psucat.mrc
 ```
diff --git a/lib/traject/psulib_config.rb b/lib/traject/psulib_config.rb
@@ -7,6 +7,7 @@
 require 'traject/marc4j_reader' if is_jruby
 require 'traject/macros/marc21_semantics'
 require 'traject/macros/marc_format_classifier'
+require_relative './psulib_marc'
 
 extend Traject::Macros::Marc21
 extend Traject::Macros::Marc21Semantics
@@ -160,28 +161,34 @@
 ## Author sorting field
 to_field 'author_ssort', marc_sortable_author
 
-# Subject fields
+# Subject field(s):
 ## Primary subject
-to_field 'subject_tsim', extract_marc(%W[
-  600#{ATOU}
-  610#{ATOU}
-  611#{ATOU}
-  630#{ATOU}
-  650abcde
-  651ae
-  653a:654abcde:655abc
-].join(':'))
+to_field 'subject_tsim', extract_marc('600abcdfklmnopqrtvxyz:610abfklmnoprstvxyz:611abcdefgklnpqstvxyz:630adfgklmnoprstvxyz:647acdg:648a:650abcd:651a:653a:654ab')
 
 ## Additional subject fields
-to_field 'subject_addl_tsim', extract_marc('600vwxyz:610vwxyz:611vwxyz:630vwxyz:650vwxyz:651vwxyz:654vwxyz:655vwxyz')
-to_field 'subject_topic_facet_ssim', extract_marc('600|*0|abcdq:610|*0|ab:611|*0|ab:630|*0|ab:650|*0|a:653|*0|a', trim_punctuation: true) do |record, accumulator, _context|
-  # Include Fast Headings
-  MarcExtractor.new('650|*7|2').collect_matching_lines(record) do |field, _spec, _extractor|
-    if field['2'].to_s.downcase.include? 'fast'
-      fast_subject = Marc21.trim_punctuation field['a']
-      accumulator << fast_subject unless fast_subject.nil?
-    end
-  end
+to_field 'subject_addl_tsim', extract_marc('600vxyz:610vxyz:611vxyz:630vxyz:647vxyz:648vxyz:650vxyz:651vxyz:654vyz')
+
+## Subject display
+hierarchy_fields = '650|*0|abcdvxyz:650|*2|abcdvxyz:650|*1|abcdvxyz:650|*3|abcdvxyz:650|*6|abcdvxyz:650|*7|abcdvxyz:600abcdfklmnopqrtvxyz:610abfklmnoprstvxyz:611abcdefgklnpqstvxyz:630adfgklmnoprstvxyz:647acdgvxyz:648avxyz:651avxyz'
+to_field 'subject_display_ssm' do |record, accumulator|
+  subjects = process_hierarchy(record, hierarchy_fields)
+  accumulator.replace(subjects)
+  accumulator.compact!
+  accumulator.uniq!
+end
+
+# Hierarchical subject facet (built from the same headings as subject_display_ssm)
+to_field 'subject_facet' do |record, accumulator|
+  subjects = process_hierarchy(record, hierarchy_fields)
+  accumulator.replace(subjects)
+  accumulator.compact!
+  accumulator.uniq!
+end
+
+# Subject facet (sidebar)
+to_field 'subject_topic_facet_ssim' do |record, accumulator|
+  subjects = process_subject_topic_facet(record, '650|*0|aa:650|*0|x:650|*1|aa:650|*1|x:651|*0|a:651|*0|x:600abcdtq:610abt:610x:611abt:611x')
+  accumulator.replace(subjects)
   accumulator.compact!
   accumulator.uniq!
 end

diff --git a/lib/traject/psulib_marc.rb b/lib/traject/psulib_marc.rb
@@ -0,0 +1,42 @@
+SEPARATOR = '—'.freeze
+
+# for the hierarchical subject/genre display
+# inserts an em dash before each v,x,y,z subfield value and trims punctuation from each part
+# optional vocabulary argument: when given, only headings whose subfield $2 value is in the list are kept
+def process_hierarchy(record, fields, vocabulary = [])
+  subjects = []
+  split_on_subfield = %w[v x y z]
+  Traject::MarcExtractor.cached(fields).collect_matching_lines(record) do |field, spec, extractor|
+    subject = extractor.collect_subfields(field, spec).first
+    include_subject = vocabulary.empty? # always include the subject if a vocabulary is not specified
+    unless subject.nil?
+      field.subfields.each do |s_field|
+        # when specified, only include subject if it is part of the vocabulary
+        include_subject = vocabulary.include?(s_field.value) if s_field.code == '2' && !vocabulary.empty?
+        subject = subject.gsub(" #{s_field.value}", "#{SEPARATOR}#{s_field.value}") if split_on_subfield.include?(s_field.code)
+      end
+      subject = subject.split(SEPARATOR)
+      subject = subject.map { |s| Traject::Macros::Marc21.trim_punctuation(s) }.join(SEPARATOR)
+      subjects << subject if include_subject
+    end
+  end
+  subjects
+end
+
+# for the split subject facet
+# breaks each heading into separate values at v,x,y,z; the em dash is only an internal delimiter
+def process_subject_topic_facet(record, fields)
+  subjects = []
+  split_on_subfield = %w[v x y z]
+  Traject::MarcExtractor.cached(fields).collect_matching_lines(record) do |field, spec, extractor|
+    subject = extractor.collect_subfields(field, spec).first
+    unless subject.nil?
+      field.subfields.each do |s_field|
+        subject = subject.gsub(" #{s_field.value}", "#{SEPARATOR}#{s_field.value}") if split_on_subfield.include?(s_field.code)
+      end
+      subject = subject.split(SEPARATOR)
+      subjects << subject.map { |s| Traject::Macros::Marc21.trim_punctuation(s) }
+    end
+  end
+  subjects.flatten
+end
diff --git a/spec/lib/traject/psulib_marc_spec.rb b/spec/lib/traject/psulib_marc_spec.rb
@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+
+require_relative '../../../lib/traject/psulib_marc'
+
+RSpec.describe 'From psulib_marc.rb' do
+  describe 'process_hierarchy function' do
+    before(:all) do
+      @s610 = { '600' => { 'ind1' => '', 'ind2' => '5', 'subfields' => [{ 'a' => 'Exclude' }] } }
+      @s600 = { '600' => { 'ind1' => '', 'ind2' => '0', 'subfields' => [{ 'a' => 'John.' }, { 't' => 'Title.' }, { 'v' => 'split genre' }, { 'd' => '2015' }, { '2' => 'special' }] } }
+      @s630 = { '630' => { 'ind1' => '', 'ind2' => '0', 'subfields' => [{ 'x' => 'Fiction' }, { 'y' => '1492' }, { 'z' => "don't ignore" }, { 't' => 'TITLE.' }] } }
+      @sample_marc = MARC::Record.new_from_hash('fields' => [@s610, @s600, @s630])
+      @subjects = process_hierarchy(@sample_marc, '600|*0|abcdfklmnopqrtvxyz:630|*0|adfgklmnoprstvxyz')
+      @vocab_subjects = process_hierarchy(@sample_marc, '600|*0|abcdfklmnopqrtvxyz:630|*0|adfgklmnoprstvxyz', ['vocab'])
+      @special_subjects = process_hierarchy(@sample_marc, '600|*0|abcdfklmnopqrtvxyz:630|*0|adfgklmnoprstvxyz', ['special'])
+    end
+    describe 'when an optional vocabulary limit is not provided' do
+      it 'excludes subjects without 0 in the 2nd indicator' do
+        expect(@subjects).not_to include('Exclude')
+        expect(@subjects).not_to include('Also Exclude')
+      end
+
+      it 'only separates v,x,y,z with em dash, strips punctuation' do
+        expect(@subjects).to include("John. Title#{SEPARATOR}split genre 2015")
+        expect(@subjects).to include("Fiction#{SEPARATOR}1492#{SEPARATOR}don't ignore TITLE")
+      end
+    end
+
+    describe 'when a vocabulary limit is provided' do
+      it 'excludes headings missing a subfield 2 or part of a different vocab' do
+        expect(@vocab_subjects).to eq []
+      end
+      it 'only includes the heading from a matching subfield 2 value' do
+        expect(@special_subjects).to eq ["John. Title#{SEPARATOR}split genre 2015"]
+      end
+    end
+  end
+
+  describe 'process_subject_topic_facet function' do
+    before(:all) do
+      @s600 = { '600' => { 'ind1' => '', 'ind2' => '0', 'subfields' => [{ 'a' => 'John.' }, { 'x' => 'Join' }, { 't' => 'Title' }, { 'd' => '2015' }] } }
+      @s630 = { '630' => { 'ind1' => '', 'ind2' => '0', 'subfields' => [{ 'x' => 'Fiction' }, { 'y' => '1492' }, { 'z' => "don't ignore" }, { 'v' => 'split genre' }] } }
+      @sample_marc = MARC::Record.new_from_hash('fields' => [@s600, @s630])
+      @subjects = process_subject_topic_facet(@sample_marc, '600|*0|abcdfklmnopqrtvxyz:630|*0|adfgklmnoprstvxyz')
+    end
+
+    it 'trims punctuation' do
+      expect(@subjects).to include('John')
+    end
+
+    it 'includes subjects split along v, x, y and z' do
+      expect(@subjects).to include('Join Title 2015')
+      expect(@subjects).to include('1492')
+      expect(@subjects).to include('split genre')
+      expect(@subjects).to include('Fiction')
+      expect(@subjects).to include("don't ignore")
+    end
+  end
+end