From 7e142762ca4bced0d5155052bcd4e694ff4cd24e Mon Sep 17 00:00:00 2001 From: Chris Merrett Date: Thu, 19 Jun 2014 11:44:21 +0100 Subject: [PATCH 1/3] Added the MJ12Bot to the bad list This crawler is supposed to be respectful. Unfortunately it ties Magento's core_url_rewrite table in knots by hammering non-existent links with non-existent parameters. More to the point, this crawler exists only so that Majestic SEO can sell the data it generates. It'll never be of any use to anyone that doesn't hire them, so blocking it is fine. --- .htaccess | 1 + 1 file changed, 1 insertion(+) diff --git a/.htaccess b/.htaccess index 187b1e1..1b06743 100644 --- a/.htaccess +++ b/.htaccess @@ -108,6 +108,7 @@ SetEnvIfNoCase User-Agent "^MIIxpc" bad_bot SetEnvIfNoCase User-Agent "^Mirror" bad_bot SetEnvIfNoCase User-Agent "^Missigua\ Locator" bad_bot SetEnvIfNoCase User-Agent "^Mister\ PiX" bad_bot +SetEnvIfNoCase User-Agent "^MJ12bot" bad_bot SetEnvIfNoCase User-Agent "^moget" bad_bot SetEnvIfNoCase User-Agent "^Mozilla/3.Mozilla/2.01" bad_bot SetEnvIfNoCase User-Agent "^Mozilla.*NEWT" bad_bot From 2a1a49e8edf18d61dd02450b095410fb6f6dbc95 Mon Sep 17 00:00:00 2001 From: Chris Merrett Date: Thu, 19 Jun 2014 11:51:52 +0100 Subject: [PATCH 2/3] Added AhrefsBot and removed leading caret from MJ12Bot I waited 3 weeks for AhrefsBot to honor a robots.txt. Gave up and blocked it. I removed the caret from MJ12Bot. I mean, who *else* would use that in their user agent string? 
--- .htaccess | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.htaccess b/.htaccess index 1b06743..adfb6c1 100644 --- a/.htaccess +++ b/.htaccess @@ -3,6 +3,7 @@ RewriteEngine on # Block Bad Bots & Scrapers SetEnvIfNoCase User-Agent "Aboundex" bad_bot +SetEnvIfNoCase User-Agent "AhrefsBot" bad_bot SetEnvIfNoCase User-Agent "80legs" bad_bot SetEnvIfNoCase User-Agent "360Spider" bad_bot SetEnvIfNoCase User-Agent "^Java" bad_bot @@ -108,7 +109,7 @@ SetEnvIfNoCase User-Agent "^MIIxpc" bad_bot SetEnvIfNoCase User-Agent "^Mirror" bad_bot SetEnvIfNoCase User-Agent "^Missigua\ Locator" bad_bot SetEnvIfNoCase User-Agent "^Mister\ PiX" bad_bot -SetEnvIfNoCase User-Agent "^MJ12bot" bad_bot +SetEnvIfNoCase User-Agent "MJ12bot" bad_bot SetEnvIfNoCase User-Agent "^moget" bad_bot SetEnvIfNoCase User-Agent "^Mozilla/3.Mozilla/2.01" bad_bot SetEnvIfNoCase User-Agent "^Mozilla.*NEWT" bad_bot From 0276f86277fc96c4e4b11c05f389dba62fac2374 Mon Sep 17 00:00:00 2001 From: Chris Merrett Date: Thu, 19 Jun 2014 12:04:21 +0100 Subject: [PATCH 3/3] Added BLEXBot for not honoring robots.txt --- .htaccess | 1 + 1 file changed, 1 insertion(+) diff --git a/.htaccess b/.htaccess index adfb6c1..7ad4080 100644 --- a/.htaccess +++ b/.htaccess @@ -18,6 +18,7 @@ SetEnvIfNoCase User-Agent "^BatchFTP" bad_bot SetEnvIfNoCase User-Agent "^Bigfoot" bad_bot SetEnvIfNoCase User-Agent "^Black.Hole" bad_bot SetEnvIfNoCase User-Agent "^BlackWidow" bad_bot +SetEnvIfNoCase User-Agent "BLEXBot" bad_bot SetEnvIfNoCase User-Agent "^BlowFish" bad_bot SetEnvIfNoCase User-Agent "^BotALot" bad_bot SetEnvIfNoCase User-Agent "Buddy" bad_bot