Better feedparsing
For RSS/Atom feeds:
* retrieving the image URL from the top-level feed data
* processing media_content the same way as enclosures
* prioritizing `summary` over `content`
jaesivsm committed Jun 28, 2024
1 parent b28f556 commit b83a858
Showing 9 changed files with 284 additions and 169 deletions.
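For orientation, the feedparser structures these changes read from look roughly like this (a minimal sketch, not part of the commit; the field names follow feedparser's conventions, all values are invented):

    # Shape of a parsed feed as the updated code expects it
    parsed = {
        "href": "https://example.com/feed.xml",
        "feed": {
            "title": "Example feed",
            "subtitle_detail": {"value": "An example", "type": "text/plain"},
            "image": {"href": "https://example.com/logo.png"},  # new icon_url source
            "links": [{"rel": "alternate", "href": "https://example.com/"}],
        },
        "entries": [{
            "summary": "Short text",
            "content": [{"value": "<p>Long HTML body</p>", "language": "en"}],
            "links": [{"rel": "enclosure", "href": "https://example.com/episode.mp3",
                       "type": "audio/mpeg"}],
            "media_content": [{"url": "https://example.com/picture.jpg",
                               "type": "image/jpeg"}],  # now handled like an enclosure
        }],
    }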
1 change: 1 addition & 0 deletions jarr/controllers/article_clusterizer.py
@@ -94,6 +94,7 @@ def get_neighbors(self, article):
article from the Clusterizer.corpus. If the corpus isn't initialized
yet, it'll be pulled out of the database.
"""

if not self.corpus_initialized:
filters = {
"__and__": [{"vector__ne": None}, {"vector__ne": ""}],
49 changes: 23 additions & 26 deletions jarr/controllers/feed_builder.py
@@ -7,6 +7,7 @@
from feedparser import parse as fp_parse
from requests.exceptions import ReadTimeout

from jarr.crawler.lib.feedparser_utils import browse_keys
from jarr.lib.const import FEED_MIMETYPES, GOOGLE_BOT_UA, REQUIRED_JSON_FEED
from jarr.lib.enums import FeedType
from jarr.lib.html_parsing import (extract_feed_links, extract_icon_url,
@@ -91,32 +92,28 @@ def is_parsed_feed(self):
    def construct_from_xml_feed_content(self):
        if not self.is_parsed_feed():
            return {}
-        fp_feed = self.parsed_feed.get('feed') or {}
-
-        result = {'link': self.feed_response.url,
-                  'site_link': fp_feed.get('href') or fp_feed.get('link'),
-                  'title': fp_feed.get('title')}
-        if self.parsed_feed.get('href'):
-            result['link'] = self.parsed_feed.get('href')
-        if fp_feed.get('subtitle'):
-            result['description'] = fp_feed.get('subtitle')
-
-        # extracting extra links
-        rel_to_link = {'self': 'link', 'alternate': 'site_link'}
-        for link in fp_feed.get('links') or []:
-            if link['rel'] not in rel_to_link:
-                logger.info('unknown link type %r', link)
-                continue
-            if result.get(rel_to_link[link['rel']]):
-                logger.debug('%r already field', rel_to_link[link['rel']])
-                continue
-            result[rel_to_link[link['rel']]] = link['href']
-
-        # extracting description
-        if not result.get('description') \
-                and (fp_feed.get('subtitle_detail') or {}).get('value'):
-            result['description'] = fp_feed['subtitle_detail']['value']
-        return {key: value for key, value in result.items() if value}
+        fp_feed = self.parsed_feed.get("feed") or {}
+        result = {"link": self.feed_response.url,
+                  "site_link": fp_feed.get("href") or fp_feed.get("link")}
+        if title := browse_keys(fp_feed, ("title", "title_detail"), "value"):
+            result["title"] = title
+        if description := browse_keys(
+            fp_feed, ("subtitle", "subtitle_detail"), "value"
+        ):
+            result["description"] = description
+        if not result["site_link"]:
+            for site_link in (fp_feed.get("links") or []):
+                try:
+                    if site_link["rel"] == "alternate":
+                        result["site_link"] = site_link["href"]
+                        break
+                except KeyError:
+                    pass
+        try:
+            result["icon_url"] = fp_feed["image"]["href"]
+        except KeyError:
+            pass
+        return result

def construct_from_json_feed_content(self):
if not self.is_parsed_feed():
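Applied to the sample feed sketched above, the rewritten construct_from_xml_feed_content would now return roughly the following (illustrative only, assuming the HTTP response URL equals the feed URL):

    {
        "link": "https://example.com/feed.xml",
        "site_link": "https://example.com/",         # fallback to the rel="alternate" link
        "title": "Example feed",
        "description": "An example",                  # via subtitle_detail["value"]
        "icon_url": "https://example.com/logo.png",   # new: taken from the top-level image
    }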
100 changes: 60 additions & 40 deletions jarr/crawler/article_builders/classic.py
@@ -5,6 +5,8 @@

import dateutil.parser
from jarr.crawler.article_builders.abstract import AbstractArticleBuilder
from jarr.crawler.lib.feedparser_utils import browse_keys, reach_in
from jarr.lib.content_generator import get_embedded_id
from jarr.lib.utils import digest

logger = logging.getLogger(__name__)
@@ -34,8 +36,10 @@ def extract_date(entry):
logger.error("Couldn't parse %r", entry[date_key])

    @staticmethod
-    def extract_title(entry):
-        return html.unescape(entry.get('title', ''))
+    def extract_title(entry, max_length=100):
+        possible_keys = "title", "title_detail", "content"
+        if title := browse_keys(entry, possible_keys, "value"):
+            return html.unescape(title[:max_length])

@staticmethod
def extract_tags(entry):
@@ -45,54 +49,70 @@ def extract_tags(entry):

    @staticmethod
    def extract_link(entry):
-        return entry.get('link')
+        return browse_keys(entry, ("link", "links"), "href")

    @staticmethod
    def extract_content(entry):
-        if entry.get('content'):
-            return entry['content'][0]['value']
-        if entry.get('summary'):
-            return entry['summary']
-        return ''
+        values = set()
+        for key in "summary", "summary_detail", "content":
+            for value in reach_in(entry, key, "value"):
+                values.add(value or "")
+        if not values:
+            return ""
+        return sorted(values, key=lambda value: -len(value))[0]

    def extract_lang(self, entry):
-        lang = None
-        if entry.get('content', []):
-            lang = (entry['content'][0] or {}).get('language')
-        if not lang:
-            for sub_key in 'title_detail', 'summary_detail':
-                lang = entry.get(sub_key, {}).get('language')
-                if lang:
-                    break
-        if not lang:
-            return self._top_level.get('language')
-        return lang
+        possible_keys = "summary_detail", "content", "title_detail"
+        lang = browse_keys(entry, possible_keys, "language")
+        return lang or self._top_level.get("language")

    @staticmethod
    def extract_comments(entry):
        return entry.get('comments')

    def _all_articles(self):
-        known_links = {self.article['link'],
-                       self.extract_link(self.entry),
-                       self.article['comments']}
+        known_links = {
+            value
+            for value in (
+                self.article["link"],
+                self.extract_link(self.entry),
+                self.article["comments"],
+                get_embedded_id(self.extract_link(self.entry)),
+            )
+            if value is not None
+        }
        yield self.article
-        for i, link in enumerate(self.entry.get('links') or []):
-            try:
-                if link['rel'] != 'enclosure':
-                    continue
-                content_type = link['type']
-                link = link['href']
-            except (KeyError, TypeError):
-                continue
-            if link in known_links:
-                continue
-            known_links.add(link)
-            enclosure = self.template_article()
-            enclosure['order_in_cluster'] = i
-            for key, value in self.article.items():
-                if key in {'title', 'lang', 'link_hash', 'entry_id'}:
-                    enclosure[key] = value
-            enclosure['link'] = link
-            self._feed_content_type(content_type, enclosure)
-            yield enclosure
+        count = 0
+        for enclosure_type in "links", "media_content":
+            for enclosure in (self.entry.get(enclosure_type) or []):
+                cluster_member = self.template_article()
+                try:
+                    content_type = enclosure["type"]
+                    if (
+                        enclosure_type == "links"
+                        and enclosure.get("rel") == "enclosure"
+                    ):
+                        cluster_member["link"] = enclosure["href"]
+                    elif enclosure_type == "media_content":
+                        cluster_member["link"] = enclosure["url"]
+                    else:
+                        continue
+                except (KeyError, TypeError):
+                    continue
+                # Not adding cluster member twice
+                if cluster_member["link"] in known_links:
+                    continue
+                known_links.add(cluster_member["link"])
+                if (
+                    embedded_id := get_embedded_id(cluster_member["link"])
+                ) and embedded_id in known_links:
+                    continue
+                known_links.add(embedded_id)
+                # Not adding cluster member without type
+                self._feed_content_type(content_type, cluster_member)
+                if not cluster_member.get("article_type"):
+                    continue
+                count += 1
+                cluster_member["order_in_cluster"] = count
+                for key, value in self.article.items():
+                    if key in {"title", "lang", "link_hash", "entry_id"}:
+                        cluster_member[key] = value
+                yield cluster_member
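Stripped of the cluster bookkeeping, the new enclosure handling normalizes both sources into (link, MIME type) pairs, along these lines (iter_enclosures is a hypothetical helper for illustration, not part of the commit):

    def iter_enclosures(entry):
        """Yield (link, mime_type) pairs from classic enclosures and media_content."""
        for link in entry.get("links") or []:
            if link.get("rel") == "enclosure" and link.get("href") and link.get("type"):
                yield link["href"], link["type"]
        for media in entry.get("media_content") or []:
            if media.get("url") and media.get("type"):
                yield media["url"], media["type"]

    # With the sample entry sketched earlier this would produce:
    # [("https://example.com/episode.mp3", "audio/mpeg"),
    #  ("https://example.com/picture.jpg", "image/jpeg")]

Each such pair becomes an extra cluster member, deduplicated against links (and embedded ids) already seen.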
30 changes: 30 additions & 0 deletions jarr/crawler/lib/feedparser_utils.py
@@ -0,0 +1,30 @@
from typing import Generator, List, Optional


def reach_in(
    entry, key: str, sub_key: Optional[str] = None
) -> Generator[str, None, None]:
    value = entry.get(key)
    if isinstance(value, str):
        assert sub_key in {None, "value", "href"}, (
            "shouldn't reach for anything else but "
            "'value' if landing on a string value"
        )
        yield value
    elif isinstance(value, list):
        for sub_value in value:
            if isinstance(sub_value, dict):
                if sub_value.get(sub_key):
                    yield sub_value[sub_key]
    elif isinstance(value, dict):
        if value.get(sub_key):
            yield value[sub_key]


def browse_keys(
    entry, keys: List[str], sub_key: Optional[str] = None
) -> Optional[str]:
    for key in keys:
        for value in reach_in(entry, key, sub_key):
            return value
    return None
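A quick usage sketch for the two helpers (the entry dict is made up; the return values follow from the code above):

    from jarr.crawler.lib.feedparser_utils import browse_keys, reach_in

    entry = {
        "title": "Plain title",
        "title_detail": {"value": "Detailed title", "language": "en"},
        "content": [{"value": "<p>Body</p>", "language": "fr"}],
    }

    list(reach_in(entry, "content", "value"))
    # ['<p>Body</p>']                     # digs into the list of dicts
    browse_keys(entry, ("title", "title_detail"), "value")
    # 'Plain title'                       # first key that yields something wins
    browse_keys(entry, ("summary_detail", "content"), "language")
    # 'fr'                                # missing keys are skipped silently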
5 changes: 5 additions & 0 deletions jarr/lib/content_generator.py
@@ -23,6 +23,11 @@ def is_embedded_link(link):
return YOUTUBE_RE.match(link)


def get_embedded_id(link):
    if match := YOUTUBE_RE.match(link):
        return match.group(5)


class ContentGenerator:
    article_type: Optional[ArticleType] = None
    feed_type: Optional[FeedType] = None
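get_embedded_id reuses the existing YOUTUBE_RE from this module; the pattern itself is outside this diff, so the example URL below is an assumption, but the intent is that different URLs pointing at the same embedded video collapse to one identifier, which is what the new deduplication in _all_articles relies on:

    # assuming YOUTUBE_RE captures the video id in its fifth group
    get_embedded_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")  # -> "dQw4w9WgXcQ"
    get_embedded_id("https://example.com/article")                  # -> None (no match)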
6 changes: 0 additions & 6 deletions tests/api/oauth_test.py
@@ -21,12 +21,6 @@ def test_GoogleAuthorizeUrl_get(self):
resp.headers['Location'])
self.assertIn(id_, resp.headers['Location'])

#def test_TwitterAuthorizeUrl_get(self):
# resp = self.jarr_client('get', 'oauth', 'twitter', 'authorize_url')
# import ipdb
# ipdb.sset_trace()
# self.assertStatusCode(301, resp)

def test_LinuxfrAuthorizeUrl_get(self):
id_ = conf.oauth.linuxfr.id = 'TEH_DLFP_ID'
conf.oauth.linuxfr.secret = 'TEH_DLFP_SECRET'
8 changes: 5 additions & 3 deletions tests/base.py
@@ -49,9 +49,11 @@ def assertInCluster(self, article, cluster, reason=ClusterReason.link):
        self.assertEqual(
            article.cluster_id,
            cluster.id,
-            f"article {article.id}:{article.entry_id}:{article.link} cluster "
-            f"{article.cluster.id} is not {cluster.id}(main "
-            f"{cluster.main_article_id}:{cluster.main_link!r})",
+            f"<Article id={article.id}, eid={article.entry_id!r}, "
+            f"link={article.link!r}> is in <Cluster id={article.cluster.id}>; "
+            f"not <Cluster id={cluster.id}>(with main "
+            f"<Article id={cluster.main_article_id}, "
+            f"link={cluster.main_link!r}>)",
        )
        self.assertEqual(reason, article.cluster_reason)
