Allow the RssCrawler to search for an RSS feed from the provided url #285

Open · wants to merge 2 commits into master
11 changes: 11 additions & 0 deletions newsplease/config/config.cfg
@@ -100,6 +100,17 @@ sitemap_allow_subdomains = True
# default: [] which means there will be no sitemap check
sitemap_patterns = []

# Set of RSS parent pages
# Allows forcing the check of specific pages for an existing RSS feed when it cannot be found on the homepage
# Here is an example of definition:
# rss_parent_pages = [
#   "",
#   "blog",
#   "actualite",
# ]
# default: [''] which means only the homepage will be checked for an RSS feed
rss_parent_pages = ['']


[Heuristics]

27 changes: 27 additions & 0 deletions newsplease/config/config_lib.cfg
@@ -86,6 +86,33 @@ ignore_regex = "(mail[tT]o)|([jJ]avascript)|(tel)|(fax)"
# default: True
sitemap_allow_subdomains = True

# Set of sitemap patterns
# Allow to force the check of specific sitemaps if it's absent from robots.txt.
# Here is an example of definition:
# sitemap_patterns = [
# "sitemap.xml",
# "post-sitemap.xml",
# "blog-posts-sitemap.xml",
# "sitemaps/post-sitemap.xml",
# "sitemap_index.xml",
# "sitemaps/sitemap_index.xml",
# "sitemaps/sitemap.xml",
# "sitemaps/sitemap-articles.xml"
# ]
# default: [] which means there will be no sitemap check
sitemap_patterns = []

# Set of RSS parent pages
# Allows forcing the check of specific pages for an existing RSS feed when it cannot be found on the homepage
# Here is an example of definition:
# rss_parent_pages = [
#   "",
#   "blog",
#   "actualite",
# ]
# default: [''] which means only the homepage will be checked for an RSS feed
rss_parent_pages = ['']



[Heuristics]
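For illustration, here is a minimal sketch of how the rss_parent_pages entries expand into candidate pages that are then probed for an RSS feed. The domain and the resulting list are made-up examples; the concatenation mirrors the one used in RssCrawler._get_rss_start_urls in the diff below.

# Hypothetical illustration only, not part of the diff.
rss_parent_pages = ["", "blog", "actualite"]
domain = "example.com"  # placeholder for the value returned by UrlExtractor.get_allowed_domain()

candidates = ["http://" + domain + "/" + pattern for pattern in rss_parent_pages]
# -> ['http://example.com/', 'http://example.com/blog', 'http://example.com/actualite']
# Each candidate is fetched and kept as a start URL only if its HTML advertises an RSS/Atom feed.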
62 changes: 43 additions & 19 deletions newsplease/crawler/spiders/rss_crawler.py
@@ -1,6 +1,9 @@
from urllib.error import HTTPError

from requests import get
from scrapy.http import TextResponse, XmlResponse

from newsplease.config import CrawlerConfig
from newsplease.crawler.spiders.newsplease_spider import NewspleaseSpider
from newsplease.helper_classes.url_extractor import UrlExtractor

@@ -46,7 +49,7 @@ def __init__(self, helper, url, config, ignore_regex, *args, **kwargs):
if config.section("Crawler").get('check_certificate') is not None
else True)

-        self.start_urls = [self.helper.url_extractor.get_start_url(url)]
+        self.start_urls = RssCrawler._get_rss_start_urls(url=url, check_certificate=self.check_certificate)

super(RssCrawler, self).__init__(*args, **kwargs)

@@ -56,8 +59,10 @@ def parse(self, response):

:param obj response: The scrapy response
"""
+        rss_url = UrlExtractor.get_rss_url(response)
+        self.logger.info(f"Retrieving RSS articles from {rss_url}")
         yield scrapy.Request(
-            UrlExtractor.get_rss_url(response), callback=self.rss_parse
+            rss_url, callback=self.rss_parse
)

def rss_parse(self, response):
@@ -100,6 +105,27 @@ def only_extracts_articles():
"""
return True

@staticmethod
def _get_rss_start_urls(url: str, check_certificate: bool = True) -> list[str]:
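        """
        Build candidate start URLs from the configured rss_parent_pages patterns
        and keep only those whose page advertises an RSS feed.

        :param str url: The url to check
        :param bool check_certificate:
        :return list[str]: The start URLs that expose an RSS feed
        """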
config = CrawlerConfig.get_instance()
rss_patterns = config.section("Crawler").get("rss_parent_pages", [''])

valid_start_urls = []
for pattern in rss_patterns:
rss_url = "http://" + UrlExtractor.get_allowed_domain(url) + '/' + pattern
try:
redirect_url = UrlExtractor.follow_redirects(url=rss_url, check_certificate=check_certificate)

# Check if a standard rss feed exists
response = UrlExtractor.request_url(url=redirect_url, check_certificate=check_certificate).read()
if response and re.search(re_rss, response.decode("utf-8")) is not None:
valid_start_urls.append(rss_url)
except HTTPError:
# 404 for this start URL, do not raise an error
pass

return valid_start_urls

@staticmethod
def supports_site(url: str, check_certificate: bool = True) -> bool:
"""
@@ -112,12 +138,7 @@ def supports_site(url: str, check_certificate: bool = True) -> bool:
    :return bool: Determines whether this crawler works on the given url
"""

-        # Follow redirects
-        redirect_url = UrlExtractor.follow_redirects(url=url, check_certificate=check_certificate)
-
-        # Check if a standard rss feed exists
-        response = UrlExtractor.request_url(url=redirect_url, check_certificate=check_certificate).read()
-        return re.search(re_rss, response.decode("utf-8")) is not None
+        return len(RssCrawler._get_rss_start_urls(url=url, check_certificate=check_certificate)) > 0

@staticmethod
def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
@@ -128,19 +149,22 @@ def has_urls_to_scan(url: str, check_certificate: bool = True) -> bool:
:param bool check_certificate:
:return bool:
"""
-        redirect_url = UrlExtractor.follow_redirects(url=url, check_certificate=check_certificate)
+        urls_to_scan = []
+        for start_url in RssCrawler._get_rss_start_urls(url=url, check_certificate=check_certificate):

+            redirect_url = UrlExtractor.follow_redirects(url=start_url, check_certificate=check_certificate)

-        response = get(url=redirect_url, verify=check_certificate)
-        scrapy_response = TextResponse(url=redirect_url, body=response.text.encode())
+            response = get(url=redirect_url, verify=check_certificate)
+            scrapy_response = TextResponse(url=redirect_url, body=response.text.encode())

-        rss_url = UrlExtractor.get_rss_url(scrapy_response)
-        rss_content = get(url=rss_url, verify=check_certificate).text
-        rss_response = XmlResponse(url=rss_url, body=rss_content, encoding="utf-8")
+            rss_url = UrlExtractor.get_rss_url(scrapy_response)
+            rss_content = get(url=rss_url, verify=check_certificate).text
+            rss_response = XmlResponse(url=rss_url, body=rss_content, encoding="utf-8")

-        urls_to_scan = [
-            url
-            for item in rss_response.xpath("//item")
-            for url in item.xpath("link/text()").extract()
-        ]
+            urls_to_scan = urls_to_scan + [
+                url
+                for item in rss_response.xpath("//item")
+                for url in item.xpath("link/text()").extract()
+            ]

return len(urls_to_scan) > 0
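
A minimal usage sketch of the resulting behaviour, assuming the crawler configuration (including rss_parent_pages) has already been loaded. The domain is a placeholder; the calls shown are the public static methods touched by this diff.

# Hypothetical usage, not part of the diff.
from newsplease.crawler.spiders.rss_crawler import RssCrawler

site = "http://example.com"  # placeholder; its RSS feed might only be linked from /blog

# supports_site() now succeeds as soon as any configured parent page
# (homepage, /blog, /actualite, ...) advertises an RSS feed, instead of
# checking the homepage only.
if RssCrawler.supports_site(site, check_certificate=True):
    # has_urls_to_scan() aggregates the <item><link> entries from the feeds
    # of every matching parent page before deciding whether there is work to do.
    print(RssCrawler.has_urls_to_scan(site, check_certificate=True))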