From 7c5c84690feaee55e5a07d92a556a2f143e773fd Mon Sep 17 00:00:00 2001
From: Michael Shamberger
Date: Mon, 27 Feb 2023 07:59:15 +0200
Subject: [PATCH 1/3] add proxy

---
 feed_seeker/feed_seeker.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/feed_seeker/feed_seeker.py b/feed_seeker/feed_seeker.py
index cc567f6..eedb4a5 100755
--- a/feed_seeker/feed_seeker.py
+++ b/feed_seeker/feed_seeker.py
@@ -372,7 +372,8 @@ def guess_feed_links(self):
 
     def find_feedly_feeds(self,
                           max_links : int = None,
-                          throttle : int = 5):
+                          throttle : int = 5,
+                          proxy: dict = None):
         """This is the class method for the find_feedly_feeds method below.
         Check out the description there for more information on how to use the method
         """
@@ -397,7 +398,7 @@ def find_feedly_feeds(self,
             params = {}
             params['query'] = url
             params['count'] = 500
-            response = requests.get(search_url,params=params)
+            response = requests.get(search_url, params=params, proxies=proxy)
             if response.status_code == 200:
                 checked_queries.add(url)
                 feeds = response.json()
@@ -472,7 +473,8 @@ def generate_feed_urls(url, html=None, spider=0, max_time=None, max_links=None):
 
 def find_feedly_feeds(url:str,
                       max_links : int = None,
-                      throttle : int = 5) -> Iterable[str]:
+                      throttle : int = 5,
+                      proxy: dict = None) -> Iterable[str]:
     """Use feedly to discover feeds
     There are a few gotchas here. Sometimes searching with the top level domain
     attached doesn't yield as many results (e.g. washingtonpost.com) -- however,
@@ -482,5 +484,5 @@ def find_feedly_feeds(url:str,
     or other issues. The default throttle between requests is 5 seconds and can
     be set using the throttle parameter.
     """
-    for feed in FeedSeeker(url).find_feedly_feeds(max_links=max_links,throttle=throttle):
+    for feed in FeedSeeker(url).find_feedly_feeds(max_links=max_links, throttle=throttle, proxy=proxy):
         yield feed

From aeab8bd834861b732137a85c475c998a4b28b4d2 Mon Sep 17 00:00:00 2001
From: Michael Shamberger
Date: Mon, 27 Feb 2023 08:08:00 +0200
Subject: [PATCH 2/3] scripts for accessing feedly in bulk

---
 script/feed.py    | 66 +++++++++++++++++++++++++++++++++++++++++++++++
 script/source.csv | 11 ++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 script/feed.py
 create mode 100644 script/source.csv

diff --git a/script/feed.py b/script/feed.py
new file mode 100644
index 0000000..ce5856d
--- /dev/null
+++ b/script/feed.py
@@ -0,0 +1,66 @@
+import os
+import sys
+import os.path
+import time
+import pandas as pd
+import config as cfg
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import uuid
+from feed_seeker import find_feedly_feeds
+
+# script to find feeds for websites listed in file source.csv using feedly.
+# use a proxy service to access feedly, since running multiple threads will cause an IP to get banned
+
+# run feedly for a data source to find its feeds
+def getFeed(source, output_dir, proxy):
+    retry_flag = True
+    retry_count = 0
+    retry_second = 3
+    retry_num = 8
+    while retry_flag and retry_count < retry_num:
+        try:
+            resp = find_feedly_feeds('https://' + source, proxy=proxy)
+            result = []
+            for feed in resp:
+                print(feed)
+                result.append(feed)
+            retry_flag = False
+        except Exception as e:
+            print(f"Retry after {retry_second*retry_count**2} seconds for {source} due to: {e}")
+            retry_count = retry_count + 1
+            time.sleep(retry_second * retry_count**2)
+    print(result)
+    df = pd.DataFrame(result)
+    df['source'] = source
+    df.to_csv(output_dir + '/' + source + '.csv')
+    return source
+
+def get_feeds_for_sources(sources, output_dir, proxy):
+    threads = []
+    respl = []
+    with ThreadPoolExecutor(max_workers=20) as executor:
+        for source in sources:
+            if os.path.exists(output_dir + '/' + source + '.csv'):
+                print(f"skipping {source}")
+                continue
+            sys.stdout.flush()
+            file_name = uuid.uuid1()
+            threads.append(executor.submit(getFeed, source, output_dir, proxy))
+        for task in as_completed(threads):
+            symbol = task.result()
+
+
+proxy = {}
+# feedly requires https proxy to be set
+#proxy = {
+# 'http': '',
+# 'https': '',
+#}
+output_dir = "feeds"
+if not os.path.exists(output_dir):
+    os.mkdir(output_dir)
+
+# open the hostnames to get feeds for
+df = pd.read_csv("source.csv")
+sources = df.domain.values.tolist()
+get_feeds_for_sources(sources, output_dir, proxy)
diff --git a/script/source.csv b/script/source.csv
new file mode 100644
index 0000000..e54eb10
--- /dev/null
+++ b/script/source.csv
@@ -0,0 +1,11 @@
+domain,total
+yahoo.com,2482877
+benzinga.com,1315542
+globenewswire.com,1127674
+prnewswire.com,768912
+indiatimes.com,728071
+einpresswire.com,522015
+finanznachrichten.de,512590
+seekingalpha.com,382240
+heraldodelbajio.com,269358
+thenelsonpost.ca,256917

From 025f79c52baf1e5ae08c06d07503c1725e4f30ec Mon Sep 17 00:00:00 2001
From: Michael Shamberger
Date: Mon, 27 Feb 2023 08:10:27 +0200
Subject: [PATCH 3/3] update note

---
 script/feed.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/script/feed.py b/script/feed.py
index ce5856d..c5496b7 100644
--- a/script/feed.py
+++ b/script/feed.py
@@ -51,7 +51,7 @@ def get_feeds_for_sources(sources, output_dir, proxy):
 
 
 proxy = {}
-# feedly requires https proxy to be set
+# if using a proxy with feedly, you should set the https proxy
 #proxy = {
 # 'http': '',
 # 'https': '',
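
A minimal usage sketch for the proxy parameter added above, assuming a
requests-style proxies mapping; the proxy URL below is a placeholder, not a
real endpoint:

    from feed_seeker import find_feedly_feeds

    # requests expects a scheme -> proxy URL mapping; the feedly search API
    # is reached over https, so the 'https' entry is the one that matters
    proxy = {
        'http': 'http://user:pass@proxy.example.com:8080',
        'https': 'http://user:pass@proxy.example.com:8080',
    }

    # washingtonpost.com is the example domain used in the docstring above
    for feed in find_feedly_feeds('washingtonpost.com', throttle=5, proxy=proxy):
        print(feed)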