Skip to content

Commit

Permalink
Add crawl_from_base_domain_by_rss_crawler option in configuration to …
Browse files Browse the repository at this point in the history
…allow the RssCrawler to search for an RSS feed in the provided URL
  • Loading branch information
yldoctrine committed Nov 8, 2024
1 parent 06d25bb commit c354720
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 8 deletions.
4 changes: 2 additions & 2 deletions newsplease/config/config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ check_crawler_has_urls_to_scan = False
# Default: True
check_certificate = True

# Defines if the RssCrawler should start crawling from the base domain or the provided url
# Defines if the RssCrawler should start crawling from the base url or the provided url
# Default: True
crawl_from_base_domain_by_rss_crawler = True
crawl_from_base_url_by_rss_crawler = True

# Determines how many hours need to pass since the last download of a webpage
# to be downloaded again by the RssCrawler
Expand Down
4 changes: 2 additions & 2 deletions newsplease/config/config_lib.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@ check_crawler_has_urls_to_scan = False
# Default: True
check_certificate = True

# Defines if the RssCrawler should start crawling from the base domain or the provided url
# Defines if the RssCrawler should start crawling from the base url or the provided url
# Default: True
crawl_from_base_domain_by_rss_crawler = True
crawl_from_base_url_by_rss_crawler = True

# Determines how many hours need to pass since the last download of a webpage
# to be downloaded again by the RssCrawler
Expand Down
8 changes: 4 additions & 4 deletions newsplease/crawler/spiders/rss_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
else True
)

crawl_from_base_domain_by_rss_crawler = (
bool(config.section("Crawler").get('crawl_from_base_domain_by_rss_crawler'))
if config.section("Crawler").get('crawl_from_base_domain_by_rss_crawler') is not None
crawl_from_base_url_by_rss_crawler = (
bool(config.section("Crawler").get('crawl_from_base_url_by_rss_crawler'))
if config.section("Crawler").get('crawl_from_base_url_by_rss_crawler') is not None
else True
)

self.start_urls = [
self.helper.url_extractor.get_start_url(url)
if crawl_from_base_domain_by_rss_crawler
if crawl_from_base_url_by_rss_crawler
else url
]

Expand Down

0 comments on commit c354720

Please sign in to comment.