FIX: use crawler layout when saving url in Wayback Machine (discourse…
majakomel authored and SamSaffron committed Jun 3, 2019
1 parent 28dcf44 commit 42809f4
Showing 5 changed files with 41 additions and 30 deletions.

app/controllers/application_controller.rb (4 changes: 3 additions & 1 deletion)

@@ -79,7 +79,9 @@ def use_crawler_layout?
       request.user_agent &&
       (request.content_type.blank? || request.content_type.include?('html')) &&
       !['json', 'rss'].include?(params[:format]) &&
-      (has_escaped_fragment? || CrawlerDetection.crawler?(request.user_agent) || params.key?("print"))
+      (has_escaped_fragment? || params.key?("print") ||
+        CrawlerDetection.crawler?(request.user_agent, request.headers["HTTP_VIA"])
+      )
   end
 
   def perform_refresh_session
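
Why the Via header matters here: the Wayback Machine's Save Page Now fetcher presents a stock Chrome user agent, so user-agent matching alone never classifies it as a crawler, but it announces itself in the standard Via proxy header, which Rails exposes under the CGI-style key "HTTP_VIA". A minimal sketch with illustrative values (the Via string is the one exercised in the specs below):

    # Illustrative values, not real method output: a Save Page Now request
    # as the controller sees it after this change.
    request.user_agent          # => "Mozilla/5.0 (Macintosh; ...) ... Chrome/74.0.3729.169 Safari/537.36"
    request.headers["HTTP_VIA"] # => "HTTP/1.0 web.archive.org (Wayback Save Page)"
    use_crawler_layout?         # => true, because CrawlerDetection.crawler? now sees the Via header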

lib/crawler_detection.rb (5 changes: 3 additions & 2 deletions)

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 
 module CrawlerDetection
+  WAYBACK_MACHINE_URL = "web.archive.org"
 
   def self.to_matcher(string, type: nil)
     escaped = string.split('|').map { |agent| Regexp.escape(agent) }.join('|')

@@ -13,8 +14,8 @@ def self.to_matcher(string, type: nil)
     Regexp.new(escaped, Regexp::IGNORECASE)
   end
 
-  def self.crawler?(user_agent)
-    return true if user_agent.nil?
+  def self.crawler?(user_agent, via_header = nil)
+    return true if user_agent.nil? || via_header&.include?(WAYBACK_MACHINE_URL)
 
     # this is done to avoid regenerating regexes
     @non_crawler_matchers ||= {}
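
The Via check short-circuits before any regex matching, and the safe-navigation call (via_header&.include?) keeps the common nil case free. A usage sketch of the new signature; return values follow from the code above and the specs in this commit:

    CrawlerDetection.crawler?(nil)
    # => true  (a missing user agent is still treated as a crawler)

    CrawlerDetection.crawler?("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36")
    # => false (ordinary browser, no Via header passed)

    CrawlerDetection.crawler?(
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
      "HTTP/1.0 web.archive.org (Wayback Save Page)"
    )
    # => true  (the Via header contains WAYBACK_MACHINE_URL)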

lib/middleware/anonymous_cache.rb (2 changes: 1 addition & 1 deletion)

@@ -62,7 +62,7 @@ def is_crawler?
       @is_crawler ||=
         begin
           user_agent = @env[USER_AGENT]
-          if CrawlerDetection.crawler?(user_agent)
+          if CrawlerDetection.crawler?(user_agent, @env["HTTP_VIA"])
             :true
           else
             user_agent.downcase.include?("discourse") ? :true : :false
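
The middleware operates on the raw Rack env rather than an ActionDispatch request, so the Via header arrives as @env["HTTP_VIA"] (USER_AGENT is assumed to be the middleware's existing constant for "HTTP_USER_AGENT"). A reduced sketch of the same decision against a bare env hash:

    # Hypothetical Rack env for a Save Page Now request; with this change the
    # anonymous cache buckets it as a crawler.
    env = {
      "HTTP_USER_AGENT" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
      "HTTP_VIA"        => "HTTP/1.0 web.archive.org (Wayback Save Page)"
    }
    CrawlerDetection.crawler?(env["HTTP_USER_AGENT"], env["HTTP_VIA"]) # => true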

spec/components/crawler_detection_spec.rb (10 changes: 7 additions & 3 deletions)

@@ -5,9 +5,9 @@
 
 describe CrawlerDetection do
 
-  def crawler!(s)
-    if (!CrawlerDetection.crawler?(s))
-      raise "#{s} should be a crawler!"
+  def crawler!(user_agent, via = nil)
+    if (!CrawlerDetection.crawler?(user_agent, via))
+      raise "#{user_agent} should be a crawler!"
     end
   end
 

@@ -50,6 +50,10 @@ def not_crawler!(s)
     crawler! "Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)"
   end
 
+  it "returns true when VIA header contains 'web.archive.org'" do
+    crawler!("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", "HTTP/1.0 web.archive.org (Wayback Save Page)")
+  end
+
   it "returns false for non-crawler user agents" do
     not_crawler! "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"
     not_crawler! "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko"

spec/requests/topics_controller_spec.rb (50 changes: 27 additions & 23 deletions)

@@ -2690,15 +2690,14 @@ def invite_group(topic, expected_status)
   end
 
   context "when a crawler" do
-    it "renders with the crawler layout, and handles proper pagination" do
-
-      page1_time = 3.months.ago
-      page2_time = 2.months.ago
-      page3_time = 1.month.ago
+    let(:topic) { Fabricate(:topic) }
+    let(:page1_time) { 3.months.ago }
+    let(:page2_time) { 2.months.ago }
+    let(:page3_time) { 1.month.ago }
 
+    before do
       freeze_time page1_time
 
-      topic = Fabricate(:topic)
       Fabricate(:post, topic: topic)
       Fabricate(:post, topic: topic)
 

@@ -2712,33 +2711,38 @@ def invite_group(topic, expected_status)
       # ugly, but no interface to set this and we don't want to create
       # 100 posts to test this thing
       TopicView.stubs(:chunk_size).returns(2)
+    end
 
-      user_agent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
-
-      get topic.url, env: { "HTTP_USER_AGENT" => user_agent }
+    shared_examples "crawler layout" do |user_agent, via|
+      it "renders with the crawler layout, and handles proper pagination" do
+        get topic.url, env: { "HTTP_USER_AGENT" => user_agent, "HTTP_VIA" => via }
 
-      body = response.body
+        body = response.body
 
-      expect(body).to have_tag(:body, with: { class: 'crawler' })
-      expect(body).to_not have_tag(:meta, with: { name: 'fragment' })
-      expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=2")
+        expect(body).to have_tag(:body, with: { class: 'crawler' })
+        expect(body).to_not have_tag(:meta, with: { name: 'fragment' })
+        expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=2")
 
-      expect(response.headers['Last-Modified']).to eq(page1_time.httpdate)
+        expect(response.headers['Last-Modified']).to eq(page1_time.httpdate)
 
-      get topic.url + "?page=2", env: { "HTTP_USER_AGENT" => user_agent }
-      body = response.body
+        get topic.url + "?page=2", env: { "HTTP_USER_AGENT" => user_agent, "HTTP_VIA" => via }
+        body = response.body
 
-      expect(response.headers['Last-Modified']).to eq(page2_time.httpdate)
+        expect(response.headers['Last-Modified']).to eq(page2_time.httpdate)
 
-      expect(body).to include('<link rel="prev" href="' + topic.relative_url)
-      expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=3")
+        expect(body).to include('<link rel="prev" href="' + topic.relative_url)
+        expect(body).to include('<link rel="next" href="' + topic.relative_url + "?page=3")
 
-      get topic.url + "?page=3", env: { "HTTP_USER_AGENT" => user_agent }
-      body = response.body
+        get topic.url + "?page=3", env: { "HTTP_USER_AGENT" => user_agent, "HTTP_VIA" => via }
+        body = response.body
 
-      expect(response.headers['Last-Modified']).to eq(page3_time.httpdate)
-      expect(body).to include('<link rel="prev" href="' + topic.relative_url + "?page=2")
-    end
+        expect(response.headers['Last-Modified']).to eq(page3_time.httpdate)
+        expect(body).to include('<link rel="prev" href="' + topic.relative_url + "?page=2")
+      end
+    end
+
+    include_examples "crawler layout", "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", nil
+    include_examples "crawler layout", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", "HTTP/1.0 web.archive.org (Wayback Save Page)"
   end
 
 end
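
The refactor above moves the pagination walk into an RSpec shared example so it runs once per crawler identity: Googlebot is recognized by user agent alone, the Wayback Machine only by its Via header. A reduced, self-contained illustration of the pattern (block parameters passed by include_examples become the user_agent and via locals; assumes the Discourse spec environment):

    shared_examples "a detected crawler" do |user_agent, via|
      it "is detected by CrawlerDetection" do
        expect(CrawlerDetection.crawler?(user_agent, via)).to eq(true)
      end
    end

    include_examples "a detected crawler",
      "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)", nil
    include_examples "a detected crawler",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
      "HTTP/1.0 web.archive.org (Wayback Save Page)"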
