Allow the RssCrawler to search for an RSS feed from the provided url #285

Open · wants to merge 2 commits into master
11 changes: 11 additions & 0 deletions newsplease/config/config.cfg
@@ -100,6 +100,17 @@ sitemap_allow_subdomains = True
# default: [] which means there will be no sitemap check
sitemap_patterns = []

# Set of RSS parent pages
# Allows forcing the check of specific pages for an existing RSS feed when it cannot be found on the homepage
# Here is an example of definition:
# rss_parent_pages = [
#   "",
#   "blog",
#   "actualite",
# ]
# default: [''] which means only the homepage will be checked for an RSS feed
rss_parent_pages = ['']


[Heuristics]

27 changes: 27 additions & 0 deletions newsplease/config/config_lib.cfg
@@ -86,6 +86,33 @@ ignore_regex = "(mail[tT]o)|([jJ]avascript)|(tel)|(fax)"
# default: True
sitemap_allow_subdomains = True

# Set of sitemap patterns
# Allow to force the check of specific sitemaps if it's absent from robots.txt.
# Here is an example of definition:
# sitemap_patterns = [
# "sitemap.xml",
# "post-sitemap.xml",
# "blog-posts-sitemap.xml",
# "sitemaps/post-sitemap.xml",
# "sitemap_index.xml",
# "sitemaps/sitemap_index.xml",
# "sitemaps/sitemap.xml",
# "sitemaps/sitemap-articles.xml"
# ]
# default: [] which means there will be no sitemap check
sitemap_patterns = []

# Set of RSS parent pages
# Allows forcing the check of specific pages for an existing RSS feed when it cannot be found on the homepage
# Here is an example of definition:
# rss_parent_pages = [
#   "",
#   "blog",
#   "actualite",
# ]
# default: [''] which means only the homepage will be checked for an RSS feed
rss_parent_pages = ['']



[Heuristics]
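For illustration, here is a minimal sketch of how the rss_parent_pages entries expand into candidate pages that are then probed for an RSS feed. The domain and the resulting list are made-up examples; the concatenation mirrors the one used in RssCrawler._get_rss_start_urls in the diff below.

# Hypothetical illustration only, not part of the diff.
rss_parent_pages = ["", "blog", "actualite"]
domain = "example.com"  # placeholder for the value returned by UrlExtractor.get_allowed_domain()

candidates = ["http://" + domain + "/" + pattern for pattern in rss_parent_pages]
# -> ['http://example.com/', 'http://example.com/blog', 'http://example.com/actualite']
# Each candidate is fetched and kept as a start URL only if its HTML advertises an RSS/Atom feed.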
62 changes: 43 additions & 19 deletions newsplease/crawler/spiders/rss_crawler.py
@@ -1,6 +1,9 @@
from urllib.error import HTTPError

from requests import get
from scrapy.http import TextResponse, XmlResponse

from newsplease.config import CrawlerConfig
from newsplease.crawler.spiders.newsplease_spider import NewspleaseSpider
from newsplease.helper_classes.url_extractor import UrlExtractor

@@ -46,7 +49,7 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
if config.section("Crawler").get('check_certificate') is not None
else True)

-        self.start_urls = [self.helper.url_extractor.get_start_url(url)]
+        self.start_urls = RssCrawler._get_rss_start_urls(url=url, check_certificate=self.check_certificate)

super(RssCrawler, self).__init__(*args, **kwargs)

@@ -56,8 +59,10 @@ def parse(self, response):

:param obj response: The scrapy response
"""
+        rss_url = UrlExtractor.get_rss_url(response)
+        self.logger.info(f"Retrieving RSS articles from {rss_url}")
         yield scrapy.Request(
-            UrlExtractor.get_rss_url(response), callback=self.rss_parse
+            rss_url, callback=self.rss_parse
)

def rss_parse(self, response):
@@ -100,6 +105,27 @@ def only_extracts_articles():
"""
return True

@staticmethod
def _get_rss_start_urls(url: str, check_certificate: bool = True) -> list[str]:
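        """
        Build candidate start URLs from the configured rss_parent_pages patterns
        and keep only those whose page advertises an RSS feed.

        :param str url: The url to check
        :param bool check_certificate:
        :return list[str]: The start URLs that expose an RSS feed
        """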
config = CrawlerConfig.get_instance()
rss_patterns = config.section("Crawler").get("rss_parent_pages", [''])

valid_start_urls = []
for pattern in rss_patterns:
rss_url = "http://" + UrlExtractor.get_allowed_domain(url) + '/' + pattern
try:
redirect_url = UrlExtractor.follow_redirects(url=rss_url, check_certificate=check_certificate)

# Check if a standard rss feed exists
response = UrlExtractor.request_url(url=redirect_url, check_certificate=check_certificate).read()
if response and re.search(re_rss, response.decode("utf-8")) is not None:
valid_start_urls.append(rss_url)
except HTTPError:
# 404 for this start URL, do not raise an error
pass

return valid_start_urls

@staticmethod
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
@@ -112,12 +138,7 @@ def supports_site(url: str, check_certificate: bool = True) -> bool:
    :return bool: Determines whether this crawler works on the given url
"""

-        # Follow redirects
-        redirect_url = UrlExtractor.follow_redirects(url=url, check_certificate=check_certificate)
-
-        # Check if a standard rss feed exists
-        response = UrlExtractor.request_url(url=redirect_url, check_certificate=check_certificate).read()
-        return re.search(re_rss, response.decode("utf-8")) is not None
+        return len(RssCrawler._get_rss_start_urls(url=url, check_certificate=check_certificate)) > 0

@staticmethod
def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
@@ -128,19 +149,22 @@ def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
:param bool check_certificate:
:return bool:
"""
-        redirect_url = UrlExtractor.follow_redirects(url=url, check_certificate=check_certificate)
+        urls_to_scan = []
+        for start_url in RssCrawler._get_rss_start_urls(url=url, check_certificate=check_certificate):

+            redirect_url = UrlExtractor.follow_redirects(url=start_url, check_certificate=check_certificate)

-        response = get(url=redirect_url, verify=check_certificate)
-        scrapy_response = TextResponse(url=redirect_url, body=response.text.encode())
+            response = get(url=redirect_url, verify=check_certificate)
+            scrapy_response = TextResponse(url=redirect_url, body=response.text.encode())

-        rss_url = UrlExtractor.get_rss_url(scrapy_response)
-        rss_content = get(url=rss_url, verify=check_certificate).text
-        rss_response = XmlResponse(url=rss_url, body=rss_content, encoding="utf-8")
+            rss_url = UrlExtractor.get_rss_url(scrapy_response)
+            rss_content = get(url=rss_url, verify=check_certificate).text
+            rss_response = XmlResponse(url=rss_url, body=rss_content, encoding="utf-8")

-        urls_to_scan = [
-            url
-            for item in rss_response.xpath("//item")
-            for url in item.xpath("link/text()").extract()
-        ]
+            urls_to_scan = urls_to_scan + [
+                url
+                for item in rss_response.xpath("//item")
+                for url in item.xpath("link/text()").extract()
+            ]

return len(urls_to_scan) > 0
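
A minimal usage sketch of the resulting behaviour, assuming the crawler configuration (including rss_parent_pages) has already been loaded. The domain is a placeholder; the calls shown are the public static methods touched by this diff.

# Hypothetical usage, not part of the diff.
from newsplease.crawler.spiders.rss_crawler import RssCrawler

site = "http://example.com"  # placeholder; its RSS feed might only be linked from /blog

# supports_site() now succeeds as soon as any configured parent page
# (homepage, /blog, /actualite, ...) advertises an RSS feed, instead of
# checking the homepage only.
if RssCrawler.supports_site(site, check_certificate=True):
    # has_urls_to_scan() aggregates the <item><link> entries from the feeds
    # of every matching parent page before deciding whether there is work to do.
    print(RssCrawler.has_urls_to_scan(site, check_certificate=True))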