Skip to content

Commit

Permalink
drop broken FakeUserAgents lib and switch to relying on https://www.u…
Browse files Browse the repository at this point in the history
  • Loading branch information
boogheta committed Aug 21, 2023
1 parent cabae1a commit ff9af58
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 90 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ This work is supported by [DIME-Web](http://dimeweb.dime-shs.sciences-po.fr/), p

Hyphe is a free open source software released under [AGPL 3.0 license](LICENSE).

Thanks to [https://www.useragents.me](https://www.useragents.me) for maintaining a great, regularly updated list of common user agents, which Hyphe reuses!

<blockquote>
<i>[...] I hear _kainos_ [(greek: "now")] in the sense of thick, ongoing presence, with __hyphae__ infusing all sorts of temporalities and materialities."</i>

Expand Down
15 changes: 9 additions & 6 deletions hyphe_backend/core.tac
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@ from bson.binary import Binary
from random import randint
from datetime import datetime
from collections import defaultdict
import logging
logging.basicConfig()

from warnings import filterwarnings
filterwarnings(action='ignore', message="Python 2 is no longer supported by the Python core team")
from twisted.internet import reactor
import logging
logging.basicConfig()

from twisted.python import log as logger
from twisted.python.logfile import LogFile
from twisted.internet import reactor
from twisted.web import server
from twisted.application.internet import TCPServer
from twisted.application.service import Application
Expand All @@ -36,7 +38,7 @@ from hyphe_backend.lib.utils import *
from hyphe_backend.lib.config_hci import test_and_make_dir, check_conf_sanity, clean_missing_corpus_options, CORPUS_CONF_SCHEMA, DEFAULT_CORPUS, TEST_CORPUS
from hyphe_backend.lib.creationrules import getPreset as getWECR
from hyphe_backend.lib.webarchives import ARCHIVES_OPTIONS
from hyphe_backend.lib.user_agents import get_random_user_agent
from hyphe_backend.lib.user_agents import UserAgentsList
from hyphe_backend.lib.tlds import collect_tlds
from hyphe_backend.lib.jobsqueue import JobsQueue
from hyphe_backend.lib.mongo import MongoDB, sortasc, sortdesc
Expand All @@ -60,6 +62,7 @@ class Core(customJSONRPC):
self.corpora = {}
self.existing_corpora = set([])
self.destroying = {}
self.user_agents_list = UserAgentsList()
self.crawler = Crawler(self)
self.store = Memory_Structure(self)
reactor.callLater(0, self.jsonrpc_list_corpus)
Expand Down Expand Up @@ -1073,7 +1076,7 @@ class Core(customJSONRPC):
if tryout > 3:
method = "GET"
headers = {'Accept': ['*/*'],
'User-Agent': [get_random_user_agent()]}
'User-Agent': [self.user_agents_list.get_random()]}
response = yield agent.request(method, url, Headers(headers), None)
except (DNSLookupError, ConnectionRefusedError) as e:
if use_proxy and (proxy_host in str(e) or type(e) == ConnectionRefusedError):
Expand Down Expand Up @@ -1258,7 +1261,7 @@ class Crawler(customJSONRPC):
'discover_prefixes': list(follow_redirects),
'ignore_internal_links': self.corpora[corpus]["options"]["ignore_internal_links"],
'proxy': proxy,
'user_agent': get_random_user_agent(),
'user_agent': self.parent.user_agents_list.get_random(),
'cookies': cookies_string,
'webarchives': webarchives
}
Expand Down
110 changes: 49 additions & 61 deletions hyphe_backend/lib/user_agents.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,58 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Cached list of recent user-agents grabbed from https://www.useragents.me
# Update the fallback cache by running python hyphe_backend/lib/user_agents.py

import os
import sys
import random
from fake_useragent import UserAgent, FakeUserAgentError

USER_AGENTS_CLIENT = None
USER_AGENTS_LIST = []

# Initialize the UserAgent client if possible, otherwise fall back to USER_AGENTS_LIST loaded from the user_agents.txt file

directory = os.path.dirname(__file__)
path_to_file = os.path.join(directory,"user_agents.txt")

# Instantiation of the UserAgent object
try:
USER_AGENTS_CLIENT = UserAgent(cache=False)
except FakeUserAgentError as e:
print "Error when trying to instantiate a user-agent with FakeUserAgent: %s.\nSwitching to local list" % e
# Transcription of the local user_agents.txt into the USER_AGENTS_LIST
with open(path_to_file) as f:
USER_AGENTS_LIST = f.read().splitlines()

def get_random_user_agent():
    """Pick one user agent string at random.

    Without a FakeUserAgent client, the choice is made among the entries of
    the local USER_AGENTS_LIST as they are. With a client, candidates are
    drawn from it until one does not contain "MSIE " (Internet Explorer).
    """
    if USER_AGENTS_CLIENT is None:
        return random.choice(USER_AGENTS_LIST)

    # Keep drawing until the client hands out a non Internet Explorer agent
    while True:
        candidate = USER_AGENTS_CLIENT.random
        if "MSIE " not in candidate:
            return candidate

def update_user_agents_list():
"""Updates the local user_agents.txt file containing 100 user agents"""

directory = os.path.dirname(__file__)
path_to_file = os.path.join(directory,"user_agents.txt")

if USER_AGENTS_CLIENT is None:
print "Error when trying to update the user-agents list with FakeUserAgent"
sys.exit(1)

# Generating a new list of 100 user agents

new_user_agents_set = set() # Using a set avoids duplicates
while len(new_user_agents_set) < 100:
ua = USER_AGENTS_CLIENT.random
if "MSIE " in ua:
continue
new_user_agents_set.add(ua)
new_user_agents_list = sorted(new_user_agents_set)

print "List of user agents successfully generated"

# Storing the list into user_agents.txt
import requests

class UserAgentsList(object):
    """Pool of browser User-Agent strings from which random ones are picked.

    The pool is filled, in order of preference, from:
      1. the ``agents_list`` given to the constructor,
      2. the latest list published by https://www.useragents.me,
      3. the local cache file (one user agent per line).

    Internet Explorer agents ("MSIE " or "Trident" tokens) are dropped from
    the downloaded list.
    """

    # JSON endpoint; the code expects {"data": [{"ua": "<user agent>", ...}, ...]}
    API_URL = "https://www.useragents.me/api"

    # Seconds to wait for the API before falling back on the cache, so that an
    # unresponsive server cannot block the caller forever.
    DOWNLOAD_TIMEOUT = 10

    def __init__(self, agents_list=None, cache_file=None, read_cache=True):
        """Build the pool.

        agents_list: optional iterable of user agent strings to use as is
                     (copied, the caller's object is never mutated).
        cache_file:  path of the fallback cache; defaults to user_agents.txt
                     next to this module.
        read_cache:  when False, never fall back on the cache file (used when
                     regenerating the cache from a fresh download).
        """
        # Copy instead of aliasing: a shared mutable default or the caller's
        # own list must not be modified through self.list
        self.list = list(agents_list) if agents_list else []
        self.cache = cache_file or os.path.join(os.path.dirname(__file__), "user_agents.txt")

        # Initiate with latest list of UserAgents or fallback with local list
        if not self.list:
            self.download_latest()
        if not self.list and read_cache:
            self.read_cache()

    @staticmethod
    def _is_internet_explorer(user_agent):
        """Tell whether a user agent string belongs to Internet Explorer."""
        return "Trident" in user_agent or "MSIE " in user_agent

    def download_latest(self):
        """Replace the pool with the latest list from useragents.me.

        Any failure (network, timeout, unexpected payload) is reported on
        stdout and leaves the current pool untouched.
        """
        try:
            response = requests.get(self.API_URL, timeout=self.DOWNLOAD_TIMEOUT)
            entries = response.json().get("data")
            self.list = [
                entry["ua"]
                for entry in entries
                if not self._is_internet_explorer(entry["ua"])
            ]
        except Exception as e:
            print("WARNING: could not download latest UserAgents list from https://www.useragents.me ; will use a local cached list: %s - %s" % (type(e), e))

    def read_cache(self):
        """Replace the pool with the content of the cache file.

        Blank lines are ignored. Failures are reported on stdout and leave
        the current pool untouched.
        """
        try:
            with open(self.cache) as f:
                lines = f.read().splitlines()
            self.list = [line.strip() for line in lines if line.strip()]
        except Exception as e:
            print("ERROR: could not read cached list of user agents in file %s: %s - %s" % (self.cache, type(e), e))

    def write_cache(self):
        """Store the pool in the cache file, one user agent per line.

        Nothing is written when the pool is empty, so that a failed download
        can never wipe an existing fallback cache.
        """
        if not self.list:
            print("ERROR: no user agents to write, leaving cache file %s untouched" % self.cache)
            return
        try:
            with open(self.cache, "w") as user_agents_file:
                for user_agent in self.list:
                    user_agents_file.write(user_agent + "\n")
        except Exception as e:
            print("ERROR: could not write list of user agents in cache file %s: %s - %s" % (self.cache, type(e), e))

    def get_random(self):
        """Returns a random user agent from the pool.

        Raises IndexError when the pool is empty (no list given, download
        failed and no usable cache).
        """
        return random.choice(self.list)

try:
with open(path_to_file, "w") as user_agents_file:
for user_agent in new_user_agents_list:
print >> user_agents_file, user_agent
except:
print "Error writing in user_agents.txt"

if __name__ == "__main__":
update_user_agents_list()
# Updates the local user_agents.txt backup file
ua_list = UserAgentsList(read_cache=False)
ua_list.write_cache()
65 changes: 43 additions & 22 deletions hyphe_backend/lib/user_agents.txt
Original file line number Diff line number Diff line change
@@ -1,22 +1,43 @@
Mozilla/5.0 (Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0
Mozilla/5.0 (Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0
Mozilla/5.0 (Macintosh; Intel Mac OS X 12.3; rv:91.0) Gecko/20100101 Firefox/91.0
Mozilla/5.0 (Macintosh; Intel Mac OS X 12.3; rv:99.0) Gecko/20100101 Firefox/99.0
Mozilla/5.0 (Macintosh; Intel Mac OS X 12_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 12_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/99.0.1150.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 12_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15
Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/99.0.1150.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0
Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0
Mozilla/5.0 (X11; Fedora; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0
Mozilla/5.0 (X11; Linux i686; rv:91.0) Gecko/20100101 Firefox/91.0
Mozilla/5.0 (X11; Linux i686; rv:99.0) Gecko/20100101 Firefox/99.0
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36
Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:91.0) Gecko/20100101 Firefox/91.0
Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:99.0) Gecko/20100101 Firefox/99.0
Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0
Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:99.0) Gecko/20100101 Firefox/99.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.57
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Safari/605.1.15
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36
Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.41
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.56
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Whale/3.19.166.16 Safari/537.36
Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.46
Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.192.400 QQBrowser/11.5.5250.400
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36
Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 OPR/95.0.0.0
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36
Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763
Mozilla/5.0 (X11; Linux x86_64; rv:108.0) Gecko/20100101 Firefox/108.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36
Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63
Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.54
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.61
Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/110.0
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.70
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ progressbar2
service_identity
virtualenvwrapper
urllib3>=1.26.9,<2
fake-useragent==0.1.11
hyphe-traph>=2.1.0,<3
incremental==21.3.0
msgpack-python>=0.3
requests==2.27.1
Scrapy==1.6.0
scrapyd-client==1.2.0a1
selenium==2.42.1
Expand Down

0 comments on commit ff9af58

Please sign in to comment.