Skip to content

Commit

Permalink
Handle wider range of MRSS schemas
Browse files Browse the repository at this point in the history
- Randomized specs (and fixed resulting problems)
- Split MRSS parser classes into two files

Close #3
  • Loading branch information
loren committed Nov 5, 2014
1 parent e21581d commit 8c8f5bd
Show file tree
Hide file tree
Showing 17 changed files with 265 additions and 98 deletions.
1 change: 1 addition & 0 deletions .rspec
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
--color
--require spec_helper
--order rand
24 changes: 24 additions & 0 deletions app/parsers/mrss.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
module Feedjira
  module Parser
    module Oasis
      # Feed-level SAX parser for RSS documents that declare either the
      # Media RSS namespace or the RSS 1.0 content module namespace.
      #
      # The channel's <title>, <link> and <description> are captured, and
      # each <item> is parsed with Oasis::MrssEntry and collected in #entries.
      class Mrss
        include SAXMachine
        include FeedUtilities

        element :title
        element :link
        element :description

        elements :item, :as => :entries, :class => Oasis::MrssEntry

        # Not populated by any SAX mapping in this class.
        # NOTE(review): presumably assigned by the caller that fetched the feed.
        attr_accessor :feed_url

        # Namespace URIs whose presence marks a document as parseable here.
        # The dots are escaped so that only the literal URIs match; an
        # unescaped "." would match any character in that position.
        REGEX_MATCH = %r{http://purl\.org/rss/1\.0/modules/content/|http://search\.yahoo\.com/mrss/}.freeze

        # Tells Feedjira whether this parser should handle a document.
        #
        # @param first_2k_xml [String] leading portion of the raw XML
        # @return [Integer, nil] offset of the first namespace match (truthy),
        #   or nil when neither namespace URI is present
        def self.able_to_parse?(first_2k_xml)
          first_2k_xml =~ REGEX_MATCH
        end
      end
    end
  end
end
45 changes: 45 additions & 0 deletions app/parsers/mrss_entry.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
module Feedjira
  module Parser
    module Oasis
      # SAX mapping for a single <item> of a feed parsed by the Oasis MRSS
      # parser. Several alternative source elements are mapped onto each
      # attribute so that differently-shaped feeds expose the same accessors:
      # entry_id, title, url, published, thumbnail_url and summary.
      class MrssEntry
        include SAXMachine
        include FeedEntryUtilities

        # Identifier
        element :guid, as: :entry_id
        element :'dc:identifier', as: :entry_id

        element :title

        element :link, as: :url

        # Publication timestamp, under the various names feeds use for it
        element :pubDate, as: :published
        element :pubdate, as: :published
        element :'dc:date', as: :published
        element :'dc:Date', as: :published
        element :'dcterms:created', as: :published
        element :issued, as: :published

        # The thumbnail URL is read from the element's "url" attribute
        element 'media:thumbnail', value: :url, as: :thumbnail_url

        # Summary text can come from any of these elements
        element :description, as: :summary
        element 'media:description', as: :summary
        element 'content:encoded', as: :summary

        # @return [String] the title as plain text with whitespace collapsed
        def title
          sanitize(@title)
        end

        # @return [String] the summary as plain text with whitespace collapsed
        def summary
          sanitize(@summary)
        end

        private

        # Parses the given markup as an HTML fragment with Loofah, takes its
        # text content, then strips and squishes the result.
        def sanitize(unsafe_html)
          fragment = Loofah.fragment(unsafe_html)
          plain_text = fragment.text
          plain_text.strip.squish
        end
      end
    end
  end
end
42 changes: 0 additions & 42 deletions app/parsers/mrss_parser.rb

This file was deleted.

3 changes: 2 additions & 1 deletion config/initializers/feedjira.rb
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
# Load the Oasis MRSS parser classes and register the feed parser with Feedjira.
# The entry class is required first because mrss.rb references
# Oasis::MrssEntry while the Mrss class body is being evaluated.
# (The former app/parsers/mrss_parser.rb no longer exists and must not be required.)
require Rails.root.join('app','parsers','mrss_entry.rb')
require Rails.root.join('app','parsers','mrss.rb')
Feedjira::Feed.add_feed_class Feedjira::Parser::Oasis::Mrss
1 change: 1 addition & 0 deletions spec/models/album_detection_photo_iterator_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

describe AlbumDetectionPhotoIterator, "run" do
before do
FlickrPhoto.delete_all
5.times do |x|
i = x + 1
FlickrPhoto.create(id: "photo #{i}", owner: "owner1", tags: ['alpha', 'bravo', 'charlie', i.ordinalize],
Expand Down
5 changes: 5 additions & 0 deletions spec/models/image_search_spec.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
require 'rails_helper'

describe ImageSearch do
before do
  # Remove every stored photo from each photo model before an example runs,
  # so results do not depend on records created by other specs.
  [FlickrPhoto, InstagramPhoto, MrssPhoto].each(&:delete_all)
end

context 'when relevant results exist in Instagram, Flickr, and MRSS indexes' do
before do
Expand Down
62 changes: 62 additions & 0 deletions spec/parsers/mrss_entry_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
require 'rails_helper'

describe Feedjira::Parser::Oasis::MrssEntry do
  # Reads the fixture at +relative_path+ (appended to Rails.root, so it starts
  # with a slash), parses it with Feedjira and returns the feed's entries.
  def entries_from(relative_path)
    xml = File.read(Rails.root.to_s + relative_path)
    Feedjira::Feed.parse(xml).entries
  end

  context 'when entry has media:thumbnail and media:description' do
    let(:entry) { entries_from('/spec/sample_feeds/dma.xml').first }

    describe 'a parsed entry' do
      it 'should have the correct title stripped and squished' do
        expect(entry.title).to eq("")
      end

      it 'should have the correct summary stripped and squished' do
        expect(entry.summary).to eq("Official Photo- of something important (U.S. Air Force Photo)")
      end

      it 'should have the correct url' do
        expect(entry.url).to eq("http://www.af.mil/News/Photos.aspx?igphoto=2000949217")
      end

      it 'should have the correct thumbnail url' do
        expect(entry.thumbnail_url).to eq("http://media.dma.mil/2014/Oct/22/2000949217/145/100/0/141022-F-PB123-223.JPG")
      end

      it 'should have the correct entry_id' do
        expect(entry.entry_id).to eq("http://www.af.mil/News/Photos.aspx?igphoto=2000949217")
      end

      it 'should have the correct published time' do
        expect(entry.published).to eq(Time.parse("2014-10-22 14:24:00Z"))
      end
    end
  end

  context 'when entry has description and media:description' do
    let(:entries) { entries_from('/spec/sample_feeds/desc_plus_mediadesc.xml') }

    describe 'a parsed entry' do
      # The fixture's first item ends with <description> and its last item
      # ends with <media:description>.
      it 'should use whatever comes last in the XML' do
        expect(entries.first.summary).to eq("This came from description")
        expect(entries.last.summary).to eq("But this came from media:description")
      end
    end
  end

  context 'when the feed uses RSS content module' do
    let(:entry) { entries_from('/spec/sample_feeds/rss_with_content_module.xml').first }

    describe 'a parsed entry' do
      it 'should use the content:encoded field for the summary' do
        expect(entry.summary).to eq("Sentence one. Sentence two. more...")
      end
    end
  end
end
49 changes: 0 additions & 49 deletions spec/parsers/mrss_parser_spec.rb

This file was deleted.

42 changes: 42 additions & 0 deletions spec/parsers/mrss_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
require 'rails_helper'

describe Feedjira::Parser::Oasis::Mrss do
  # Returns the raw contents of the fixture at +relative_path+, which is
  # appended to Rails.root and therefore starts with a slash.
  def read_fixture(relative_path)
    File.read(Rails.root.to_s + relative_path)
  end

  context 'for DMA feed' do
    let(:dma_mrss_xml) { read_fixture('/spec/sample_feeds/dma.xml') }

    describe '#able_to_parse?' do
      context 'when first 2000 chars of XML contains MRSS text string' do
        it 'should return true' do
          expect(described_class.able_to_parse?(dma_mrss_xml)).to be_truthy
        end
      end
    end

    describe 'the parser' do
      it 'should pull out the entries properly' do
        first_entry = Feedjira::Feed.parse(dma_mrss_xml).entries.first
        expect(first_entry.class).to eq(Feedjira::Parser::Oasis::MrssEntry)
      end
    end
  end

  context 'for RSS with content module' do
    let(:mrss_xml) { read_fixture('/spec/sample_feeds/rss_with_content_module.xml') }

    describe '#able_to_parse?' do
      context 'when first 2000 chars of XML contains the content namespace text string' do
        it 'should return true' do
          expect(described_class.able_to_parse?(mrss_xml)).to be_truthy
        end
      end
    end

    describe 'the parser' do
      it 'should pull out the entries properly' do
        first_entry = Feedjira::Feed.parse(mrss_xml).entries.first
        expect(first_entry.class).to eq(Feedjira::Parser::Oasis::MrssEntry)
      end
    end
  end
end
1 change: 1 addition & 0 deletions spec/requests/api/v1/flickr_profiles_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
describe "GET /api/v1/flickr_profiles" do
context 'when profiles exist' do
before do
FlickrProfile.delete_all
FlickrProfile.create(name: 'profile2', id: '2', profile_type: 'group')
FlickrProfile.create(name: 'profile1', id: '1', profile_type: 'user')
FlickrProfile.refresh_index!
Expand Down
1 change: 1 addition & 0 deletions spec/requests/api/v1/instagram_profiles_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
describe "GET /api/v1/instagram_profiles" do
context 'when profiles exist' do
before do
InstagramProfile.delete_all
InstagramProfile.create(username: 'profile2', id: '2')
InstagramProfile.create(username: 'profile1', id: '1')
InstagramProfile.refresh_index!
Expand Down
38 changes: 38 additions & 0 deletions spec/sample_feeds/desc_plus_mediadesc.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<?xml version="1.0" encoding="utf-8"?>
<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<atom:link href="http://media.dma.mil/mrss/portal/144/detailpage/www.af.mil/News/Photos.aspx" rel="self"
type="application/rss+xml"/>
<title>Air Force Link Images</title>
<link>http://www.af.mil</link>
<description>The latest images from Air Force Link.</description>
<item>
<title type="html">
<![CDATA[ ]]>
</title>
<link>http://www.af.mil/News/Photos.aspx?igphoto=2000949217</link>
<guid>http://www.af.mil/News/Photos.aspx?igphoto=2000949217</guid>
<pubDate>Wed, 22 Oct 2014 14:24:00 GMT</pubDate>
<media:thumbnail url="http://media.dma.mil/2014/Oct/22/2000949217/145/100/0/141022-F-PB123-223.JPG" width="72"
height="100"/>
<media:content url="http://media.dma.mil/2014/Oct/22/2000949217/-1/-1/0/141022-F-PB123-223.JPG" width="1500"
height="2100"/>
<media:description>This came from media:description</media:description>
<description>This came from description</description>
</item>
<item>
<title type="html">
<![CDATA[ ]]>
</title>
<link>http://www.af.mil/News/Photos.aspx?igphoto=2000949218</link>
<guid>http://www.af.mil/News/Photos.aspx?igphoto=2000949218</guid>
<pubDate>Wed, 22 Oct 2014 14:24:00 GMT</pubDate>
<media:thumbnail url="http://media.dma.mil/2014/Oct/22/2000949218/145/100/0/141022-F-PB123-223.JPG" width="72"
height="100"/>
<media:content url="http://media.dma.mil/2014/Oct/22/2000949218/-1/-1/0/141022-F-PB123-223.JPG" width="1500"
height="2100"/>
<description>But this came from description</description>
<media:description>But this came from <!--Here is a comment --> media:description</media:description>
</item>
</channel>
</rss>
37 changes: 37 additions & 0 deletions spec/sample_feeds/rss_with_content_module.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/">
<channel>
<title><![CDATA[Joint Base San Antonio - Commentaries]]></title>
<link>http://www.jbsa.af.mil</link>
<description><![CDATA[Joint Base San Antonio - Commentaries]]></description>
<language>en-US</language>
<copyright><![CDATA[2014 Joint Base San Antonio]]></copyright>
<pubDate>Tue, 30 Sep 2014 17:25:16 GMT</pubDate>
<lastBuildDate>Tue, 30 Sep 2014 17:25:16 GMT</lastBuildDate>
<generator>Air Force Link RSS Generator</generator>
<item>
<title><![CDATA[Celebrating National Hispanic Heritage Month]]></title>
<link>http://www.jbsa.af.mil/news/story.asp?id=123426145</link>
<content:encoded><![CDATA[Sentence one.<br />
<br />
Sentence two.<br />
<br />
<a href="http://www.jbsa.af.mil/news/story.asp?id=123426145">more...</a>]]></content:encoded>
<author>[email protected] (Maj. Gen. Jimmie O. Keenan)</author>
<guid>http://www.jbsa.af.mil/news/story.asp?id=123426145</guid>
<pubDate>Thu, 25 Sep 2014 15:58:24 EST</pubDate>
</item>
<item>
<title><![CDATA[Beyond 360 feed back is 360 accountability]]></title>
<link>http://www.jbsa.af.mil/news/story.asp?id=123422070</link>
<content:encoded><![CDATA[<div style="float:left;"><a href="http://www.jbsa.af.mil/news/story.asp?id=123422070"><img border="0" style="margin-right:15px" src="http://www.jbsa.af.mil/shared/media/photodb/thumbnails/2014/06/140617-F-XX000-002.jpg"</img></a></div><font size="3"><font face="Times New Roman">In highly accomplished teams and organizations, every member is accountable for their performance - whether hitting a baseball or flying an airplane.<o:p></o:p></font></font><font face="Times New Roman" size="3"> </font>
<p class="MsoNormal" style="margin: 0in 0in 0pt;">&#160;
<p class="MsoNormal" style="margin: 0in 0in 0pt;"><font size="3"><font face="Times New Roman">That is why in Air Force Operations, whether flying or defending, controlling or building, we debrief the mission, compare our performance to standards, and develop learning points to improve the next mission. In that debrief, everyone is held to equal account according to the standards of their job, whether they are O-5 or E-3, commander or wingman. In the mission debrief, we have 360-degree accountability.<o:p></o:p></font></font> <font face="Times New Roman" size="3"> </font>
<p class="MsoNormal" style="margin: 0in 0in 0pt;">&#160;
<p class="MsoNormal" style="margin: 0<br/><a href="http://www.jbsa.af.mil/news/story.asp?id=123422070">more...</a>]]></content:encoded>
<author>[email protected] (Col. Matt Isler )</author>
<guid>http://www.jbsa.af.mil/news/story.asp?id=123422070</guid>
<pubDate>Thu, 21 Aug 2014 12:15:00 EST</pubDate>
</item>
</channel>
</rss>
Loading

0 comments on commit 8c8f5bd

Please sign in to comment.