bulk access to feedly with proxy #10

Open · wants to merge 3 commits into develop
10 changes: 6 additions & 4 deletions feed_seeker/feed_seeker.py
@@ -372,7 +372,8 @@ def guess_feed_links(self):
 
     def find_feedly_feeds(self,
                           max_links : int = None,
-                          throttle : int = 5):
+                          throttle : int = 5,
+                          proxy: str = None):
         """This is the class method for the find_feedly_feeds method below. Check out the
         description there for more information on how to use the method
         """
@@ -397,7 +398,7 @@ def find_feedly_feeds(self,
             params = {}
             params['query'] = url
             params['count'] = 500
-            response = requests.get(search_url,params=params)
+            response = requests.get(search_url, params=params, proxies=proxy)
             if response.status_code == 200:
                 checked_queries.add(url)
                 feeds = response.json()
@@ -472,7 +473,8 @@ def generate_feed_urls(url, html=None, spider=0, max_time=None, max_links=None):
 
 def find_feedly_feeds(url:str,
                       max_links : int = None,
-                      throttle : int = 5) -> Iterable[str]:
+                      throttle : int = 5,
+                      proxy: str = None) -> Iterable[str]:
     """Use feedly to discover feeds
     There are a few gotchas here. Sometimes searching with the top level domain
     attached doesn't yield as many results (e.g. washingtonpost.com) -- however,
@@ -482,5 +484,5 @@ def find_feedly_feeds(url:str,
     or other issues. The default throttle between requests is 5 seconds and can be
     set using the throttle parameter.
     """
-    for feed in FeedSeeker(url).find_feedly_feeds(max_links=max_links,throttle=throttle):
+    for feed in FeedSeeker(url).find_feedly_feeds(max_links=max_links, throttle=throttle, proxy=proxy):
         yield feed
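
Since the new proxy argument is handed straight to requests.get(..., proxies=proxy), it should be a requests-style mapping of scheme to proxy URL rather than a plain string. A minimal usage sketch of the updated function (the proxy endpoint below is a placeholder, not part of this diff):

from feed_seeker import find_feedly_feeds

# hypothetical proxy endpoint; substitute your own proxy service
proxies = {
    'http': 'http://user:pass@proxy.example.com:8080',
    'https': 'http://user:pass@proxy.example.com:8080',
}

for feed in find_feedly_feeds('washingtonpost.com', throttle=5, proxy=proxies):
    print(feed)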
66 changes: 66 additions & 0 deletions script/feed.py
@@ -0,0 +1,66 @@
import os
import sys
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from feed_seeker import find_feedly_feeds

# Script to find feeds for the websites listed in source.csv using feedly.
# Use a proxy service to access feedly, since running multiple threads from a
# single IP will quickly get that IP banned.

# run feedly for a single data source and write the discovered feeds to a CSV
def getFeed(source, output_dir, proxy):
    retry_flag = True
    retry_count = 0
    retry_second = 3
    retry_num = 8
    result = []
    while retry_flag and retry_count < retry_num:
        try:
            resp = find_feedly_feeds('https://' + source, proxy=proxy)
            result = []
            for feed in resp:
                print(feed)
                result.append(feed)
            retry_flag = False
        except Exception as e:
            retry_count = retry_count + 1
            delay = retry_second * retry_count**2
            print(f"Retry after {delay} seconds for {source} due to: {e}")
            time.sleep(delay)
    print(result)
    df = pd.DataFrame(result)
    df['source'] = source
    df.to_csv(output_dir + '/' + source + '.csv')
    return source

# fan the sources out over a thread pool, skipping any source that already has output
def get_feeds_for_sources(sources, output_dir, proxy):
    futures = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        for source in sources:
            if os.path.exists(output_dir + '/' + source + '.csv'):
                print(f"skipping {source}")
                continue
            sys.stdout.flush()
            futures.append(executor.submit(getFeed, source, output_dir, proxy))
        for task in as_completed(futures):
            task.result()  # re-raise any exception from the worker thread


proxy = {}
# when using a proxy with feedly you should set the https entry,
# since the feedly search endpoint is requested over https
#proxy = {
#    'http': '',
#    'https': '',
#}
output_dir = "feeds"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

# read the hostnames to find feeds for
df = pd.read_csv("source.csv")
sources = df.domain.values.tolist()
get_feeds_for_sources(sources, output_dir, proxy)
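
Each source ends up in its own CSV under feeds/. A small follow-up sketch, not part of this script, for stitching the per-source files back into a single table with the same pandas setup (the output filename is just illustrative):

import glob
import pandas as pd

# gather the per-source CSVs written by getFeed into one frame
frames = [pd.read_csv(path) for path in glob.glob("feeds/*.csv")]
all_feeds = pd.concat(frames, ignore_index=True)
all_feeds.to_csv("all_feeds.csv", index=False)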
11 changes: 11 additions & 0 deletions script/source.csv
@@ -0,0 +1,11 @@
domain,total
yahoo.com,2482877
benzinga.com,1315542
globenewswire.com,1127674
prnewswire.com,768912
indiatimes.com,728071
einpresswire.com,522015
finanznachrichten.de,512590
seekingalpha.com,382240
heraldodelbajio.com,269358
thenelsonpost.ca,256917