diff --git a/Pipfile b/Pipfile index 9c34363e..e620c743 100644 --- a/Pipfile +++ b/Pipfile @@ -37,7 +37,6 @@ flask-restx = "==1.*" goose3 = "==3.*" gunicorn = "==22.*" json-logging-py = "*" # needed by gunicorn config -lxml = "==5.*" opml = "==0.*" prometheus-distributed-client = "==1.*" psycopg2-binary = "==2.*" diff --git a/Pipfile.lock b/Pipfile.lock index d9cf7bb6..a5582e81 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "7543ae2282afeb5f616d1f2740f823eefb81d9d913e70a82160b6599bb0b03d7" + "sha256": "0231a5b948b0d6b4e88fb162e61b30d81a8a8870461ca9ce329611059817271f" }, "pipfile-spec": 6, "requires": { @@ -691,7 +691,7 @@ "sha256:ff097ae562e637409b429a7ac958a20aab237a0378c42dabaa1e3abf2f896e5f", "sha256:ff46d772d5f6f73564979cd77a4fffe55c916a05f3cb70e7c9c0590059fb29ef" ], - "index": "pypi", + "markers": "python_version >= '3.6'", "version": "==5.2.1" }, "mako": { diff --git a/jarr/lib/content_generator.py b/jarr/lib/content_generator.py index 2eaa1309..f8b32c6e 100644 --- a/jarr/lib/content_generator.py +++ b/jarr/lib/content_generator.py @@ -5,7 +5,6 @@ from typing import Optional from goose3 import Goose -from lxml import etree from jarr.bootstrap import conf from jarr.controllers.article import to_vector from jarr.lib.enums import ArticleType, FeedType @@ -15,8 +14,9 @@ logger = logging.getLogger(__name__) IMG_ALT_MAX_LENGTH = 100 YOUTUBE_RE = re.compile( - r'^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))' - r'(\/(?:[\w\-]+\?v=|embed\/|v\/)?)([\w\-]+)(\S+)?$') + r"^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu.be))" + r"(\/(?:[\w\-]+\?v=|embed\/|v\/)?)([\w\-]+)(\S+)?$" +) def is_embedded_link(link): @@ -37,16 +37,16 @@ def _get_goose(self): try: self._page = goose.extract(self.article.link) except Exception as error: - logger.error("something wrong happened while trying to fetch " - "%r: %r", self.article.link, error) + msg = "something wrong happened while trying to fetch %r: %r" + logger.error(msg, self.article.link, error) if not self._page: return False - lang = self._page.opengraph.get('locale') or self._page.meta_lang - self.extracted_infos['lang'] = clean_lang(lang) - self.extracted_infos['link'] = self._page.final_url - keywords = set(self._page.meta_keywords.split(', ')) - self.extracted_infos['tags'] = set(self._page.tags).union(keywords) - self.extracted_infos['title'] = self._page.title + lang = self._page.opengraph.get("locale") or self._page.meta_lang + self.extracted_infos["lang"] = clean_lang(lang) + self.extracted_infos["link"] = self._page.final_url + keywords = set(self._page.meta_keywords.split(", ")) + self.extracted_infos["tags"] = set(self._page.tags).union(keywords) + self.extracted_infos["title"] = self._page.title return True def get_vector(self): @@ -55,16 +55,6 @@ def get_vector(self): if self._page and self.extracted_infos: return to_vector(self.extracted_infos, self._page) - def _from_goose_to_html(self, encoding="utf8"): - result = b"" - current_node = self._page.top_node - while True: - result += etree.tostring(current_node, encoding=encoding) - current_node = current_node.getnext() - if current_node is None: - break - return result.replace(b'\n', b' ').decode(encoding) - @staticmethod def generate(): return {} @@ -72,14 +62,16 @@ def generate(): def generate_and_merge(self, content): content = migrate_content(content) # if there is already some fetched content - already_fetched = any(cnt.get('type') == 'fetched' - for cnt in content.get('contents') or []) + already_fetched = any( + cnt.get("type") == "fetched" + for cnt in content.get("contents") or [] + ) if isinstance(self, TruncatedContentGenerator) and already_fetched: return content article_content = self.generate() if not article_content: return content - content['contents'].append(article_content) + content["contents"].append(article_content) return content @@ -120,15 +112,15 @@ def get_vector(): def generate(self): yt_match = YOUTUBE_RE.match(self.article.link) if yt_match: - logger.info('%r constructing embedded youtube content ' - 'from article', self.article) + msg = "%r constructing embedded youtube content from article" + logger.info(msg, self.article) try: - return {'type': 'youtube', 'link': yt_match.group(5)} + return {"type": "youtube", "link": yt_match.group(5)} except IndexError: pass else: - logger.warning('embedded video not recognized %r', - self.article.link) + msg = "embedded media not recognized %r" + logger.warning(msg, self.article.link) return {} @@ -137,18 +129,18 @@ class TruncatedContentGenerator(ContentGenerator): def generate(self): if self._page is None: self._get_goose() - content = {'type': 'fetched'} + content = {"type": "fetched"} try: - content['content'] = self._from_goose_to_html() - content['link'] = remove_utm_tags(self._page.final_url) - content['title'] = self._page.title + content["content"] = self._page.top_node_raw_html + content["link"] = remove_utm_tags(self._page.final_url) + content["title"] = self._page.title except Exception: - logger.exception("Could not rebuild parsed content for %r", - self.article) + msg = "Could not rebuild parsed content for %r" + logger.exception(msg, self.article) return {} if self.article.comments: - content['comments'] = self.article.comments - logger.debug('%r no special type found doing nothing', self.article) + content["comments"] = self.article.comments + logger.debug("%r no special type found doing nothing", self.article) return content @@ -168,9 +160,12 @@ def is_pure_reddit_post(self): return self._is_pure_reddit_post try: split = urllib.parse.urlsplit(self.article.link) - paths = split.path.strip('/').split('/') - if ('reddit.com' in split.netloc - and paths[0] == 'r' and paths[2] == 'comments'): + paths = split.path.strip("/").split("/") + if ( + "reddit.com" in split.netloc + and paths[0] == "r" + and paths[2] == "comments" + ): self._is_pure_reddit_post = True except (AttributeError, IndexError): pass @@ -207,8 +202,7 @@ def get_content_generator(article): if article.article_type and article.article_type in CONTENT_GENERATORS: return CONTENT_GENERATORS[article.article_type](article) - if article.feed.feed_type \ - and article.feed.feed_type in CONTENT_GENERATORS: + if article.feed.feed_type and article.feed.feed_type in CONTENT_GENERATORS: return CONTENT_GENERATORS[article.feed.feed_type](article) if article.feed.truncated_content: @@ -218,12 +212,11 @@ def get_content_generator(article): def migrate_content(content: dict): - content = content or {'v': 2, 'contents': []} - if content.get('v') == 2: + content = content or {"v": 2, "contents": []} + if content.get("v") == 2: return content - if content['type'] in {'image', 'audio', 'video'}: - return {'v': 2, 'contents': []} - if content['type'] == 'embedded': # migrating original embedded - return {'v': 2, 'contents': [{'type': content['player'], - 'link': content['videoId']}]} - return {'v': 2, 'contents': [content]} + if content["type"] in {"image", "audio", "video"}: + return {"v": 2, "contents": []} + if content["type"] == "embedded": # migrating original embedded + content = {"type": content["player"], "link": content["videoId"]} + return {"v": 2, "contents": [content]}