diff --git a/newsplease/config/config.cfg b/newsplease/config/config.cfg index a9a8de78..2bae2009 100644 --- a/newsplease/config/config.cfg +++ b/newsplease/config/config.cfg @@ -45,9 +45,9 @@ check_crawler_has_urls_to_scan = False # Default: True check_certificate = True -# Defines if the RssCrawler should start crawling from the base domain or the provided url +# Defines if the RssCrawler should start crawling from the base url or the provided url # Default: True -crawl_from_base_domain_by_rss_crawler = True +crawl_from_base_url_by_rss_crawler = True # Determines how many hours need to pass since the last download of a webpage # to be downloaded again by the RssCrawler diff --git a/newsplease/config/config_lib.cfg b/newsplease/config/config_lib.cfg index 859c3a08..ce22a4fb 100644 --- a/newsplease/config/config_lib.cfg +++ b/newsplease/config/config_lib.cfg @@ -51,9 +51,9 @@ check_crawler_has_urls_to_scan = False # Default: True check_certificate = True -# Defines if the RssCrawler should start crawling from the base domain or the provided url +# Defines if the RssCrawler should start crawling from the base url or the provided url # Default: True -crawl_from_base_domain_by_rss_crawler = True +crawl_from_base_url_by_rss_crawler = True # Determines how many hours need to pass since the last download of a webpage # to be downloaded again by the RssCrawler diff --git a/newsplease/crawler/spiders/rss_crawler.py b/newsplease/crawler/spiders/rss_crawler.py index 9b67fc3a..0dbfeef1 100644 --- a/newsplease/crawler/spiders/rss_crawler.py +++ b/newsplease/crawler/spiders/rss_crawler.py @@ -48,15 +48,15 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs): else True ) - crawl_from_base_domain_by_rss_crawler = ( - bool(config.section("Crawler").get('crawl_from_base_domain_by_rss_crawler')) - if config.section("Crawler").get('crawl_from_base_domain_by_rss_crawler') is not None + crawl_from_base_url_by_rss_crawler = ( + bool(config.section("Crawler").get('crawl_from_base_url_by_rss_crawler')) + if config.section("Crawler").get('crawl_from_base_url_by_rss_crawler') is not None else True ) self.start_urls = [ self.helper.url_extractor.get_start_url(url) - if crawl_from_base_domain_by_rss_crawler + if crawl_from_base_url_by_rss_crawler else url ]