bulk access to feedly with proxy #10

Open · wants to merge 3 commits into develop
10 changes: 6 additions & 4 deletions feed_seeker/feed_seeker.py
@@ -372,7 +372,8 @@ def guess_feed_links(self):
 
     def find_feedly_feeds(self,
                           max_links : int = None,
-                          throttle : int = 5):
+                          throttle : int = 5,
+                          proxy: str = None):
         """This is the class method for the find_feedly_feeds method below. Check out the
         description there for more information on how to use the method
         """
@@ -397,7 +398,7 @@ def find_feedly_feeds(self,
             params = {}
             params['query'] = url
             params['count'] = 500
-            response = requests.get(search_url,params=params)
+            response = requests.get(search_url, params=params, proxies=proxy)
             if response.status_code == 200:
                 checked_queries.add(url)
                 feeds = response.json()
@@ -472,7 +473,8 @@ def generate_feed_urls(url, html=None, spider=0, max_time=None, max_links=None):
 
 def find_feedly_feeds(url:str,
                       max_links : int = None,
-                      throttle : int = 5) -> Iterable[str]:
+                      throttle : int = 5,
+                      proxy: str = None) -> Iterable[str]:
     """Use feedly to discover feeds
     There are a few gotchas here. Sometimes searching with the top level domain
     attached doesn't yield as many results (e.g. washingtonpost.com) -- however,
@@ -482,5 +484,5 @@ def find_feedly_feeds(url:str,
     or other issues. The default throttle between requests is 5 seconds and can be
     set using the throttle parameter.
     """
-    for feed in FeedSeeker(url).find_feedly_feeds(max_links=max_links,throttle=throttle):
+    for feed in FeedSeeker(url).find_feedly_feeds(max_links=max_links, throttle=throttle, proxy=proxy):
         yield feed
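
Since the new proxy argument is handed straight to requests.get(..., proxies=proxy), it should be a requests-style mapping of scheme to proxy URL rather than a plain string. A minimal usage sketch of the updated function (the proxy endpoint below is a placeholder, not part of this diff):

from feed_seeker import find_feedly_feeds

# hypothetical proxy endpoint; substitute your own proxy service
proxies = {
    'http': 'http://user:pass@proxy.example.com:8080',
    'https': 'http://user:pass@proxy.example.com:8080',
}

for feed in find_feedly_feeds('washingtonpost.com', throttle=5, proxy=proxies):
    print(feed)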
66 changes: 66 additions & 0 deletions script/feed.py
@@ -0,0 +1,66 @@
import os
import sys
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from feed_seeker import find_feedly_feeds

# Script to find feeds for the websites listed in source.csv using feedly.
# Use a proxy service to access feedly, since running multiple threads from a
# single IP will quickly get that IP banned.

# run feedly for a single data source and write the discovered feeds to a CSV
def getFeed(source, output_dir, proxy):
    retry_flag = True
    retry_count = 0
    retry_second = 3
    retry_num = 8
    result = []
    while retry_flag and retry_count < retry_num:
        try:
            resp = find_feedly_feeds('https://' + source, proxy=proxy)
            result = []
            for feed in resp:
                print(feed)
                result.append(feed)
            retry_flag = False
        except Exception as e:
            retry_count = retry_count + 1
            delay = retry_second * retry_count**2
            print(f"Retry after {delay} seconds for {source} due to: {e}")
            time.sleep(delay)
    print(result)
    df = pd.DataFrame(result)
    df['source'] = source
    df.to_csv(output_dir + '/' + source + '.csv')
    return source

# fan the sources out over a thread pool, skipping any source that already has output
def get_feeds_for_sources(sources, output_dir, proxy):
    futures = []
    with ThreadPoolExecutor(max_workers=20) as executor:
        for source in sources:
            if os.path.exists(output_dir + '/' + source + '.csv'):
                print(f"skipping {source}")
                continue
            sys.stdout.flush()
            futures.append(executor.submit(getFeed, source, output_dir, proxy))
        for task in as_completed(futures):
            task.result()  # re-raise any exception from the worker thread


proxy = {}
# when using a proxy with feedly you should set the https entry,
# since the feedly search endpoint is requested over https
#proxy = {
#    'http': '',
#    'https': '',
#}
output_dir = "feeds"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

# read the hostnames to find feeds for
df = pd.read_csv("source.csv")
sources = df.domain.values.tolist()
get_feeds_for_sources(sources, output_dir, proxy)
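
Each source ends up in its own CSV under feeds/. A small follow-up sketch, not part of this script, for stitching the per-source files back into a single table with the same pandas setup (the output filename is just illustrative):

import glob
import pandas as pd

# gather the per-source CSVs written by getFeed into one frame
frames = [pd.read_csv(path) for path in glob.glob("feeds/*.csv")]
all_feeds = pd.concat(frames, ignore_index=True)
all_feeds.to_csv("all_feeds.csv", index=False)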
11 changes: 11 additions & 0 deletions script/source.csv
@@ -0,0 +1,11 @@
domain,total
yahoo.com,2482877
benzinga.com,1315542
globenewswire.com,1127674
prnewswire.com,768912
indiatimes.com,728071
einpresswire.com,522015
finanznachrichten.de,512590
seekingalpha.com,382240
heraldodelbajio.com,269358
thenelsonpost.ca,256917