From 7c5c84690feaee55e5a07d92a556a2f143e773fd Mon Sep 17 00:00:00 2001
From: Michael Shamberger
Date: Mon, 27 Feb 2023 07:59:15 +0200
Subject: [PATCH 1/3] add proxy

---
 feed_seeker/feed_seeker.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/feed_seeker/feed_seeker.py b/feed_seeker/feed_seeker.py
index cc567f6..eedb4a5 100755
--- a/feed_seeker/feed_seeker.py
+++ b/feed_seeker/feed_seeker.py
@@ -372,7 +372,8 @@ def guess_feed_links(self):
 
     def find_feedly_feeds(self,
                           max_links : int = None,
-                          throttle : int = 5):
+                          throttle : int = 5,
+                          proxy: dict = None):
         """This is the class method for the find_feedly_feeds method below.
         Check out the description there for more information on how to use the method
         """
@@ -397,7 +398,7 @@ def find_feedly_feeds(self,
             params = {}
             params['query'] = url
             params['count'] = 500
-            response = requests.get(search_url,params=params)
+            response = requests.get(search_url, params=params, proxies=proxy)
             if response.status_code == 200:
                 checked_queries.add(url)
                 feeds = response.json()
@@ -472,7 +473,8 @@ def generate_feed_urls(url, html=None, spider=0, max_time=None, max_links=None):
 
 def find_feedly_feeds(url:str,
                       max_links : int = None,
-                      throttle : int = 5) -> Iterable[str]:
+                      throttle : int = 5,
+                      proxy: dict = None) -> Iterable[str]:
     """Use feedly to discover feeds
     There are a few gotchas here. Sometimes searching with the top level domain
     attached doesn't yield as many results (e.g. washingtonpost.com) -- however,
@@ -482,5 +484,5 @@ def find_feedly_feeds(url:str,
     or other issues. The default throttle between requests is 5 seconds and can
     be set using the throttle parameter.
     """
-    for feed in FeedSeeker(url).find_feedly_feeds(max_links=max_links,throttle=throttle):
+    for feed in FeedSeeker(url).find_feedly_feeds(max_links=max_links, throttle=throttle, proxy=proxy):
         yield feed

From aeab8bd834861b732137a85c475c998a4b28b4d2 Mon Sep 17 00:00:00 2001
From: Michael Shamberger
Date: Mon, 27 Feb 2023 08:08:00 +0200
Subject: [PATCH 2/3] scripts for accessing feedly in bulk

---
 script/feed.py    | 66 +++++++++++++++++++++++++++++++++++++++++++++++
 script/source.csv | 11 ++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 script/feed.py
 create mode 100644 script/source.csv

diff --git a/script/feed.py b/script/feed.py
new file mode 100644
index 0000000..ce5856d
--- /dev/null
+++ b/script/feed.py
@@ -0,0 +1,66 @@
+import os
+import sys
+import os.path
+import time
+import pandas as pd
+import config as cfg
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import uuid
+from feed_seeker import find_feedly_feeds
+
+# script to find feeds for websites listed in file source.csv using feedly.
+# use a proxy service to access feedly, since running multiple threads will cause an IP to get banned
+
+# run feedly for a data source to find its feeds
+def getFeed(source, output_dir, proxy):
+    retry_flag = True
+    retry_count = 0
+    retry_second = 3
+    retry_num = 8
+    while retry_flag and retry_count < retry_num:
+        try:
+            resp = find_feedly_feeds('https://' + source, proxy=proxy)
+            result = []
+            for feed in resp:
+                print(feed)
+                result.append(feed)
+            retry_flag = False
+        except Exception as e:
+            print(f"Retry after {retry_second*retry_count**2} seconds for {source} due to: {e}")
+            retry_count = retry_count + 1
+            time.sleep(retry_second * retry_count**2)
+    print(result)
+    df = pd.DataFrame(result)
+    df['source'] = source
+    df.to_csv(output_dir + '/' + source + '.csv')
+    return source
+
+def get_feeds_for_sources(sources, output_dir, proxy):
+    threads = []
+    respl = []
+    with ThreadPoolExecutor(max_workers=20) as executor:
+        for source in sources:
+            if os.path.exists(output_dir + '/' + source + '.csv'):
+                print(f"skipping {source}")
+                continue
+            sys.stdout.flush()
+            file_name = uuid.uuid1()
+            threads.append(executor.submit(getFeed, source, output_dir, proxy))
+        for task in as_completed(threads):
+            symbol = task.result()
+
+
+proxy = {}
+# feedly requires https proxy to be set
+#proxy = {
+# 'http': '',
+# 'https': '',
+#}
+output_dir = "feeds"
+if not os.path.exists(output_dir):
+    os.mkdir(output_dir)
+
+# open the hostnames to get feeds for
+df = pd.read_csv("source.csv")
+sources = df.domain.values.tolist()
+get_feeds_for_sources(sources, output_dir, proxy)
diff --git a/script/source.csv b/script/source.csv
new file mode 100644
index 0000000..e54eb10
--- /dev/null
+++ b/script/source.csv
@@ -0,0 +1,11 @@
+domain,total
+yahoo.com,2482877
+benzinga.com,1315542
+globenewswire.com,1127674
+prnewswire.com,768912
+indiatimes.com,728071
+einpresswire.com,522015
+finanznachrichten.de,512590
+seekingalpha.com,382240
+heraldodelbajio.com,269358
+thenelsonpost.ca,256917

From 025f79c52baf1e5ae08c06d07503c1725e4f30ec Mon Sep 17 00:00:00 2001
From: Michael Shamberger
Date: Mon, 27 Feb 2023 08:10:27 +0200
Subject: [PATCH 3/3] update note

---
 script/feed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/script/feed.py b/script/feed.py
index ce5856d..c5496b7 100644
--- a/script/feed.py
+++ b/script/feed.py
@@ -51,7 +51,7 @@ def get_feeds_for_sources(sources, output_dir, proxy):
 
 
 proxy = {}
-# feedly requires https proxy to be set
+# if using a proxy with feedly, you should set the https proxy
 #proxy = {
 # 'http': '',
 # 'https': '',
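
A minimal usage sketch for the proxy parameter added above, assuming a
requests-style proxies mapping; the proxy URL below is a placeholder, not a
real endpoint:

    from feed_seeker import find_feedly_feeds

    # requests expects a scheme -> proxy URL mapping; the feedly search API
    # is reached over https, so the 'https' entry is the one that matters
    proxy = {
        'http': 'http://user:pass@proxy.example.com:8080',
        'https': 'http://user:pass@proxy.example.com:8080',
    }

    # washingtonpost.com is the example domain used in the docstring above
    for feed in find_feedly_feeds('washingtonpost.com', throttle=5, proxy=proxy):
        print(feed)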