Skip to content

Commit

Permalink
Create hierarchical dates for opengeometadata records
Browse files Browse the repository at this point in the history
Closes #1407
  • Loading branch information
thatbudakguy committed Sep 12, 2024
1 parent ce3411a commit 8b39c44
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 4 deletions.
46 changes: 45 additions & 1 deletion lib/earthworks/harvester.rb
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,16 @@ def transform_record(record, path)
record.update({ 'schema_provider_s' => transformed_provider })
end

# Filter out themes that are not in the controlled vocabulary from OGM
if (themes = record['dcat_theme_sm'])
# Filter out themes that are not in the controlled vocabulary from OGM
record.update({ 'dcat_theme_sm' => themes.select { |theme| Settings.ALLOWED_OGM_THEMES.include?(theme) } })
end

# Add a hierarchicalized version of the dates for the year facet
if (years = record['gbl_indexYear_im'])
record.update({ 'date_hierarchy_sm' => hierarchicalize_year_list(years) })
end

record
end

Expand All @@ -63,5 +69,43 @@ def record_repo(path)
#
# Memoised list of repositories to harvest.
#
# When @ogm_repos is set, the parent implementation's list is stripped of nils
# and narrowed to the names that appear as keys in @ogm_repos; otherwise the
# parent's list is used untouched.
def repositories
  return @repositories if @repositories

  @repositories =
    if @ogm_repos
      super.compact.select { |repo_name| @ogm_repos.key?(repo_name) }
    else
      super
    end
end

# NOTE: this duplicates logic in searchworks_traject_indexer, see:
# https://github.com/sul-dlss/searchworks_traject_indexer/pull/1521
#
# Expands a list of years into colon-delimited hierarchy values for faceting.
#
# Every year contributes three levels: its century, "century:decade" and
# "century:decade:year". Centuries and decades shared by several years are
# emitted once; the per-year entries are emitted once per input element.
#
# @param years [Enumerable] year values taken from the record
# @return [Array<String>] centuries first, then decades, then full year paths
def hierarchicalize_year_list(years)
  century_facets = Set.new
  decade_facets = Set.new
  year_facets = []

  years.each do |year|
    century, decade = centimate_and_decimate(year)
    century_facets.add(century)
    decade_facets.add([century, decade].join(':'))
    year_facets.push([century, decade, year].join(':'))
  end

  # Sets keep insertion order, so each level lists values in first-seen order.
  [*century_facets, *decade_facets, *year_facets]
end

# NOTE: this duplicates logic in searchworks_traject_indexer, see:
# https://github.com/sul-dlss/searchworks_traject_indexer/pull/1521
#
# Works out the century and decade range labels for a single year value.
#
# Numeric values are used as-is (truncated to an integer); anything else is
# parsed strictly as a base-10 integer. Values that cannot be parsed, including
# nil and the empty string, yield the "unknown" labels rather than being
# treated as year 0.
#
# @param maybe_year [Integer, String, nil] candidate year
# @return [Array(String, String)] century label and decade label, or
#   ["unknown_century", "unknown_decade"] when no year can be determined
def centimate_and_decimate(maybe_year)
  # A bare #to_i maps unparseable strings and nil to 0, which Date accepts as a
  # real year; strict parsing lets such input reach the "unknown" labels.
  year = if maybe_year.is_a?(Numeric)
           maybe_year.to_i
         else
           Integer(maybe_year.to_s, 10, exception: false)
         end
  return %w[unknown_century unknown_decade] if year.nil?

  parsed_date = Date.new(year)
  [century_from_date(parsed_date), decade_from_date(parsed_date)]
rescue Date::Error
  %w[unknown_century unknown_decade] # guess not
end

# NOTE: this duplicates logic in searchworks_traject_indexer, see:
# https://github.com/sul-dlss/searchworks_traject_indexer/pull/1521
#
# Formats the century containing +date+ as a "CC00-CC99" range label.
#
# @param date [Date] object responding to #strftime with the %C directive
# @return [String] e.g. "1700-1799" for a date in 1701
def century_from_date(date)
  century_prefix = date.strftime('%C') # year divided by 100, zero-padded to two digits
  "#{century_prefix}00-#{century_prefix}99"
end

# NOTE: this duplicates logic in searchworks_traject_indexer, see:
# https://github.com/sul-dlss/searchworks_traject_indexer/pull/1521
#
# Formats the decade containing +date+ as a "YYY0-YYY9" range label.
#
# @param date [Date] date whose year selects the decade
# @return [String] e.g. "1980-1989" for a date in 1980
def decade_from_date(date)
  tens = date.year.div(10) # drop the final digit of the year
  "#{tens}0-#{tens}9"
end
end
end
36 changes: 33 additions & 3 deletions spec/lib/earthworks/harvester_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,31 @@

describe '#docs_to_index' do
# Provenance value will be transformed by our ogm_repos config
let(:psu_doc) { { schema_provider_s: 'Pennsylvania State University', gbl_mdVersion_s: 'Aardvark' }.to_json }
let(:psu_doc) do
{
schema_provider_s: 'Pennsylvania State University',
gbl_mdVersion_s: 'Aardvark',
gbl_indexYear_im: [1701, 1980, 1991, 1995]
}.to_json
end
let(:psu_path) { "#{ogm_path}/edu.psu/metadata-aardvark/Maps/08d-01/geoblacklight.json" }

# PolicyMap records have placeholder data and should be skipped
let(:policymap_doc) { { schema_provider_s: 'Geoblacklight', gbl_mdVersion_s: 'Aardvark' }.to_json }
let(:policymap_doc) do
{
schema_provider_s: 'Geoblacklight',
gbl_mdVersion_s: 'Aardvark'
}.to_json
end
let(:policymap_path) { "#{ogm_path}/shared-repository/gbl-policymap/records/geoblacklight.json" }

let(:psu_doc2_hash) { { schema_provider_s: 'Pennsylvania State University', gbl_mdVersion_s: 'Aardvark' } }
let(:psu_doc2_hash) do
{
schema_provider_s: 'Pennsylvania State University',
gbl_mdVersion_s: 'Aardvark',
gbl_indexYear_im: [1538]
}
end
let(:psu_doc2) { psu_doc2_hash.to_json }
let(:psu_path2) { "#{ogm_path}/edu.psu/metadata-aardvark/Maps/08d-01/geoblacklight_2.json" }

Expand Down Expand Up @@ -101,6 +118,19 @@
expect(docs.first.first['schema_provider_s']).to eq('Penn State')
end

# Checks the date_hierarchy_sm values produced for harvested documents: each
# indexed year should appear as its century, "century:decade" and
# "century:decade:year", with a century or decade shared by several years
# listed only once.
# NOTE(review): presumably docs.first is built from psu_doc (years 1701, 1980,
# 1991, 1995) and docs.last from psu_doc2 (year 1538) — the fixture wiring is
# outside this view; confirm.
it 'generates hierarchical years for the year facet' do
docs = harvester.docs_to_index.to_a
# contain_exactly matches contents regardless of order, so ordering of the
# hierarchy values is not asserted here.
expect(docs.first.first['date_hierarchy_sm']).to contain_exactly(
'1700-1799', '1900-1999',
'1700-1799:1700-1709', '1900-1999:1980-1989', '1900-1999:1990-1999',
'1700-1799:1700-1709:1701', '1900-1999:1980-1989:1980', '1900-1999:1990-1999:1991', '1900-1999:1990-1999:1995'
)
expect(docs.last.first['date_hierarchy_sm']).to contain_exactly(
'1500-1599', '1500-1599:1530-1539',
'1500-1599:1530-1539:1538'
)
end

context 'when record contains themes outside the controlled vocabulary' do
let(:psu_doc) do
psu_doc2_hash.merge({ dcat_theme_sm: %w[Agriculture Biota Farming] }).to_json
Expand Down

0 comments on commit 8b39c44

Please sign in to comment.