Skip to content

Commit

Permalink
Add crawl_from_base_domain_by_rss_crawler option in configuration to …
Browse files Browse the repository at this point in the history
…allow the RssCrawler to search for an RSS feed in the provided URL
  • Loading branch information
yldoctrine committed Nov 8, 2024
1 parent 06d25bb commit c354720
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 8 deletions.
4 changes: 2 additions & 2 deletions newsplease/config/config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ check_crawler_has_urls_to_scan = False
# Default: True
check_certificate = True

# Defines if the RssCrawler should start crawling from the base domain or the provided url
# Defines if the RssCrawler should start crawling from the base url or the provided url
# Default: True
crawl_from_base_domain_by_rss_crawler = True
crawl_from_base_url_by_rss_crawler = True

# Determines how many hours need to pass since the last download of a webpage
# to be downloaded again by the RssCrawler
Expand Down
4 changes: 2 additions & 2 deletions newsplease/config/config_lib.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ check_crawler_has_urls_to_scan = False
# Default: True
check_certificate = True

# Defines if the RssCrawler should start crawling from the base domain or the provided url
# Defines if the RssCrawler should start crawling from the base url or the provided url
# Default: True
crawl_from_base_domain_by_rss_crawler = True
crawl_from_base_url_by_rss_crawler = True

# Determines how many hours need to pass since the last download of a webpage
# to be downloaded again by the RssCrawler
Expand Down
8 changes: 4 additions & 4 deletions newsplease/crawler/spiders/rss_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
else True
)

crawl_from_base_domain_by_rss_crawler = (
bool(config.section("Crawler").get('crawl_from_base_domain_by_rss_crawler'))
if config.section("Crawler").get('crawl_from_base_domain_by_rss_crawler') is not None
crawl_from_base_url_by_rss_crawler = (
bool(config.section("Crawler").get('crawl_from_base_url_by_rss_crawler'))
if config.section("Crawler").get('crawl_from_base_url_by_rss_crawler') is not None
else True
)

self.start_urls = [
self.helper.url_extractor.get_start_url(url)
if crawl_from_base_domain_by_rss_crawler
if crawl_from_base_url_by_rss_crawler
else url
]

Expand Down

0 comments on commit c354720

Please sign in to comment.