Commit

Add readability to calibre and Hacker News by Tom Scholl. Fixes #833261 (Add readability lib for use in recipes)
kovidgoyal committed Aug 24, 2011
2 parents 7fd8bf0 + 2fc5adc commit a7f9931
Showing 9 changed files with 812 additions and 0 deletions.
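
In brief, the new support gives a recipe two hooks: set articles_are_obfuscated so every article URL is routed through get_obfuscated_article(), and call extract_readable_article(html, url) there to get back a (cleaned article html, extracted title) pair. A minimal sketch, distilled from the Hacker News recipe below (the class name, placeholder feed, and temp-file suffix are illustrative, not part of this commit):

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile

class ReadableSketch(BasicNewsRecipe):
    title = 'Readability sketch'
    requires_version = (0, 8, 16)   # same version pin as the recipe below
    articles_are_obfuscated = True  # route every article through the hook below
    feeds = [(u'Example feed', 'http://example.com/rss')]  # placeholder
    temp_files = []

    def get_obfuscated_article(self, url):
        raw = self.get_browser().open(url).read()
        # extract_readable_article() returns (cleaned article html, extracted title)
        body, title = self.extract_readable_article(raw, url)
        self.temp_files.append(PersistentTemporaryFile('_sketch.html'))
        self.temp_files[-1].write(u'<html><head><title>%s</title></head><body>%s</body></html>'
                % (title, body))
        self.temp_files[-1].close()
        return self.temp_files[-1].name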
6 changes: 6 additions & 0 deletions COPYRIGHT
@@ -28,6 +28,12 @@ License: other
are permitted in any medium without royalty provided the copyright
notice and this notice are preserved.

Files: src/calibre/ebooks/readability/*
Copyright: Unknown
License: Apache 2.0
The full text of the Apache 2.0 license is available at:
http://www.apache.org/licenses/LICENSE-2.0

Files: /src/cherrypy/*
Copyright: Copyright (c) 2004-2007, CherryPy Team ([email protected])
Copyright: Copyright (C) 2005, Tiago Cogumbreiro <[email protected]>
90 changes: 90 additions & 0 deletions recipes/hackernews.recipe
@@ -0,0 +1,90 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
'''
Hacker News
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse

class HackerNews(BasicNewsRecipe):
    title = 'Hacker News'
    __author__ = 'Tom Scholl'
    description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
    publisher = 'Y Combinator'
    category = 'news, programming, it, technology'
    masthead_url = 'http://i55.tinypic.com/2u6io76.png'
    cover_url = 'http://i55.tinypic.com/2u6io76.png'
    delay = 1
    max_articles_per_feed = 30
    use_embedded_content = False
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'en'
    requires_version = (0, 8, 16)

    feeds = [
        (u'Hacker News', 'http://news.ycombinator.com/rss')
    ]

    temp_files = []
    articles_are_obfuscated = True

    def get_readable_content(self, url):
        self.log('get_readable_content(' + url + ')')
        br = self.get_browser()
        f = br.open(url)
        html = f.read()
        f.close()

        data = self.extract_readable_article(html, url)
        article_html = data[0]
        extracted_title = data[1]
        article_html = u'<cite><strong>' + extracted_title + u'</strong></cite><span> (' + self.prettyify_url(url) + u')</span><br/>' + article_html
        return u'<html><head><title>' + extracted_title + u'</title></head><body>' + article_html + u'</body></html>'

    def get_hn_content(self, url):
        self.log('get_hn_content(' + url + ')')
        # this could be improved
        br = self.get_browser()
        f = br.open(url)
        html = f.read()
        f.close()
        return html

    def get_obfuscated_article(self, url):
        if url.startswith('http://news.ycombinator.com'):
            content = self.get_hn_content(url)
        else:
            # TODO: use content-type header instead of url
            is_image = False
            for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp']:
                if url.endswith(ext):
                    is_image = True
                    break

            if is_image:
                self.log('using image_content (' + url + ')')
                content = u'<html><body><img src="' + url + u'"></body></html>'
            else:
                content = self.get_readable_content(url)

        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(content)
        self.temp_files[-1].close()
        return self.temp_files[-1].name

    def is_link_wanted(self, url, tag):
        if url.endswith('.pdf'):
            return False
        return True

    def prettyify_url(self, url):
        return urlparse(url).hostname

    def populate_article_metadata(self, article, soup, first):
        article.text_summary = self.prettyify_url(article.url)
        article.summary = article.text_summary


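To try the recipe locally, calibre's usual test invocation applies (in test mode only a couple of articles per feed are fetched):

    ebook-convert hackernews.recipe output.epub --test -vv
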
37 changes: 37 additions & 0 deletions src/calibre/ebooks/readability/README.txt
@@ -0,0 +1,37 @@
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0

This is a Python port of a Ruby port of arc90's readability project, taken
from https://github.com/buriy/python-readability

The original readability project:
http://lab.arc90.com/experiments/readability/

In a few words: given an HTML document, it pulls out the main body text and
cleans it up. It can also clean up the title, based on the latest
readability.js code.

Based on:
- Latest readability.js ( https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js )
- Ruby port by starrhorne and iterationlabs
- Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup )
- Decruft effort to move to lxml ( http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/ )
- "BR to P" fix from readability.js which improves quality for smaller texts.
- Github users contributions.

Installation::

easy_install readability-lxml
or
pip install readability-lxml

Usage::

from readability.readability import Document
import urllib
html = urllib.urlopen(url).read()
readable_article = Document(html).summary()
readable_title = Document(html).short_title()

Command-line usage::

python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
1 change: 1 addition & 0 deletions src/calibre/ebooks/readability/__init__.py
@@ -0,0 +1 @@

32 changes: 32 additions & 0 deletions src/calibre/ebooks/readability/cleaners.py
@@ -0,0 +1,32 @@
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re
from lxml.html.clean import Cleaner

bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<" # open
    "([^>]+) " # prefix
    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
    "([^>]*)" # postfix
    ">" # end
    , re.I)

def clean_attributes(html):
    while htmlstrip.search(html):
        html = htmlstrip.sub('<\\1\\2>', html)
    return html

def normalize_spaces(s):
    """Replace any sequence of whitespace characters with a single space."""
    if not s:
        return ''
    return ' '.join(s.split())

html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                       style=True, links=True, meta=False, add_nofollow=False,
                       page_structure=False, processing_instructions=True, embedded=False,
                       frames=False, forms=False, annoying_tags=False, remove_tags=None,
                       remove_unknown_tags=False, safe_attrs_only=False)
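
As an illustration of clean_attributes: each pass of the while loop removes one matched attribute, and the loop repeats until the pattern no longer matches. A quick sketch with hypothetical inputs:

from calibre.ebooks.readability.cleaners import clean_attributes

# width and bgcolor are stripped over two passes
assert clean_attributes('<td width="200" bgcolor="#fff">x</td>') == '<td>x</td>'
# quoted and unquoted attribute values are both handled
assert clean_attributes('<p style="color: red" height=100>y</p>') == '<p>y</p>'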
25 changes: 25 additions & 0 deletions src/calibre/ebooks/readability/debug.py
@@ -0,0 +1,25 @@
def save_to_file(text, filename):
    f = open(filename, 'wt')
    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    f.write(text.encode('utf-8'))
    f.close()

uids = {}
def describe(node, depth=2):
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
        if node not in uids:
            uid = uids[node] = len(uids) + 1
        else:
            uid = uids.get(node)
        name += "%02d" % (uid)
    if depth and node.getparent() is not None:
        return name + ' - ' + describe(node.getparent(), depth - 1)
    return name
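
For example, describe() names a node CSS-selector style and walks up to `depth` ancestors, which is handy when logging scoring decisions. A sketch with a hypothetical fragment:

from lxml import etree
from calibre.ebooks.readability.debug import describe

root = etree.fromstring('<div id="main"><p class="lead">hi</p></div>')
print describe(root.find('.//p'))  # -> p.lead - #main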
103 changes: 103 additions & 0 deletions src/calibre/ebooks/readability/htmls.py
@@ -0,0 +1,103 @@
import re

from lxml.html import tostring
import lxml.html

from calibre.ebooks.readability.cleaners import normalize_spaces, clean_attributes
from calibre.ebooks.chardet import xml_to_unicode

def build_doc(page):
    page_unicode = xml_to_unicode(page, strip_encoding_pats=True)[0]
    doc = lxml.html.document_fromstring(page_unicode)
    return doc

def js_re(src, pattern, flags, repl):
    # Fixed argument order: sub(replacement, string). The replacement has
    # JavaScript-style $1 group references converted to Python's \1.
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)


def normalize_entities(cur_title):
    entities = {
        u'\u2014': '-',
        u'\u2013': '-',
        u'&mdash;': '-',
        u'&ndash;': '-',
        u'\u00A0': ' ',
        u'\u00AB': '"',
        u'\u00BB': '"',
        u'&quot;': '"',
    }
    for c, r in entities.iteritems():
        if c in cur_title:
            cur_title = cur_title.replace(c, r)

    return cur_title

def norm_title(title):
    return normalize_entities(normalize_spaces(title))

def get_title(doc):
    # Guard against documents with no <title> element, which would
    # otherwise raise an AttributeError on .text
    title = doc.find('.//title')
    if title is None or not title.text:
        return '[no-title]'

    return norm_title(title.text)

def add_match(collection, text, orig):
    text = norm_title(text)
    if len(text.split()) >= 2 and len(text) >= 15:
        if text.replace('"', '') in orig.replace('"', ''):
            collection.add(text)

def shorten_title(doc):
    # Same guard as get_title(): a missing <title> must not crash
    title = doc.find('.//title')
    if title is None or not title.text:
        return ''

    title = orig = norm_title(title.text)

    candidates = set()

    for item in ['.//h1', './/h2', './/h3']:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
        for e in doc.cssselect(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            # for/else: no delimiter produced a long enough part
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    if not 15 < len(title) < 150:
        return orig

    return title

def get_body(doc):
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
    raw_html = unicode(tostring(doc.body or doc))
    return clean_attributes(raw_html)
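
As a rough illustration of the title heuristics: a heading that matches the start of the <title> beats the raw <title> with its site-name clutter. A sketch with a hypothetical page:

import lxml.html
from calibre.ebooks.readability.htmls import get_title, shorten_title

doc = lxml.html.document_fromstring(
    '<html><head><title>Vacuum tubes in audio - Example News</title></head>'
    '<body><h1>Vacuum tubes in audio</h1></body></html>')
print get_title(doc)      # -> Vacuum tubes in audio - Example News
print shorten_title(doc)  # -> Vacuum tubes in audio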

[Diffs for the remaining 2 of 9 changed files not loaded]