This repository has been archived by the owner on May 3, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrodeo_spec.rb
104 lines (88 loc) · 3.91 KB
/
rodeo_spec.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# frozen_string_literal: true
require 'spec_helper'
# Specs for the module-level API of Derivative::Rodeo: reading/yielding the
# configuration, processing the derivative chain for one manifest, and
# processing manifests described by a CSV document.
RSpec.describe Derivative::Rodeo do
  # Shared by every example group below that passes a configuration to the API.
  let(:config) { Fixtures.config }

  describe '.config' do
    subject { described_class.config }

    it { is_expected.to be_a described_class::Configuration }

    it "yields a Configuration" do
      expect { |b| described_class.config(&b) }.to yield_with_args(kind_of(described_class::Configuration))
    end
  end

  describe '.process_derivative' do
    # Named because the examples invoke it for its side effects rather than
    # asserting on its return value.
    subject(:process_derivative) { described_class.process_derivative(json: arena.to_json, config: config) }

    let(:arena) { Fixtures.arena(manifest: manifest) }
    # Each context supplies the five attributes consumed here.
    let(:manifest) do
      Fixtures.manifest(
        work_identifier: work_identifier,
        file_set_filename: file_set_filename,
        derivatives: derivatives,
        mime_type: mime_type,
        path_to_original: path_to_original
      )
    end

    # Whether the arena's local storage currently reports the given derivative as present.
    def locally_stored?(derivative)
      arena.local_storage.exists?(derivative: derivative)
    end

    context "with a 2 page color PDF" do
      let(:file_set_filename) { "sample-color-newsletter.pdf" }
      let(:work_identifier) { "with-original-only" }
      let(:derivatives) { [:split_pdf] }
      let(:mime_type) { "application/pdf" }
      let(:path_to_original) { Fixtures.path_for(file_set_filename) }

      it "splits the pages into images and extracts text" do
        # TODO: verify that we have two (or four because of monochrome) images on the file system
        # TODO: verify that we have two hocr files on the file system
        # TODO: this still needs ALTO incorporated
        #
        # Until the TODOs above are addressed this is only a smoke test: it
        # fails if processing raises, and asserts nothing about the output.
        expect { process_derivative }.not_to raise_error
      end
    end

    context "with a color image" do
      # Yes these are the defaults, but I'd rather be explicit in what we're doing.
      let(:work_identifier) { 'parent-identifier' }
      let(:file_set_filename) { 'ocr_color.tiff' }
      let(:derivatives) { { monochrome: Fixtures.path_for('ocr_gray.tiff') } }
      let(:mime_type) { "image/tiff" }
      let(:path_to_original) { Fixtures.path_for(file_set_filename) }

      it "runs the pre-processing and mime step processing" do
        # One compound matcher instead of three nested `expect` blocks: every
        # derivative must flip from absent to present across a single run, and
        # all mismatches are reported together on failure.
        expect { process_derivative }.to(
          change { locally_stored?(:base_file_for_chain) }.from(false).to(true)
            .and(change { locally_stored?(:monochrome) }.from(false).to(true))
            .and(change { locally_stored?(:hocr) }.from(false).to(true))
        )
      end
    end

    context 'with a remote URL for the original' do
      let(:work_identifier) { 'parent-identifier' }
      let(:file_set_filename) { 'ocr_color.tiff' }
      let(:derivatives) { { monochrome: Fixtures.path_for('ocr_gray.tiff') } }
      let(:mime_type) { "image/tiff" }
      let(:path_to_original) { "https://takeonrules.com/" }
      let(:original_content) { "Hello World\nNice to See You!\n" }

      it 'downloads that original file' do
        # Intercept these calls so the example never touches the network.
        allow(Derivative::Rodeo::Utilities::Url).to receive(:read).with(path_to_original).and_return(original_content)
        allow(Derivative::Rodeo::Utilities::Url).to receive(:exists?).with(path_to_original).and_return(true)

        expect { process_derivative }
          .to change { locally_stored?(:base_file_for_chain) }.from(false).to(true)

        # The stubbed remote body must be what ends up in the local base file.
        expect(File.read(arena.local_path(derivative: :base_file_for_chain))).to eq(original_content)
      end
    end

    # Placeholders for formats that still need coverage.
    # NOTE(review): a `context` without a block registers no examples, so these
    # presumably do not appear as pending in the report — confirm, and consider
    # block-less `it` examples if they should be visible.
    context 'with a JPG'
    context 'with a PNG'
    context 'with a MOV'
    context 'with a WAV'
  end

  # ADL: They have Reader PDF, TXT, Thumbnail, Archival PDF
  describe '.process_file_sets_from_csv' do
    # A header row plus a single file set row; the trailing nil leaves mime_type blank.
    # NOTE(review): this row points at "ocr_mono.tiff" while the contexts above
    # use 'ocr_gray.tiff' for monochrome — confirm that fixture exists.
    let(:csv) do
      CSV.generate do |rows|
        rows << %w[work_identifier file_set_filename path_to_original monochrome mime_type]
        rows << ["123", "ocr_color.tiff", Fixtures.path_for("ocr_color.tiff"), Fixtures.path_for("ocr_mono.tiff"), nil]
      end
    end

    it "calls and enqueue the entire derivative chain for each provided manifest" do
      # Smoke test only: nothing about the enqueued chain is asserted yet.
      expect { described_class.process_file_sets_from_csv(csv, config: config) }.not_to raise_error
    end
  end
end