Commit

Add readability to calibre and Hacker News by Tom Scholl. Fixes #833261 (Add readability lib for use in recipes)
kovidgoyal committed Aug 24, 2011
2 parents 7fd8bf0 + 2fc5adc commit a7f9931
Showing 9 changed files with 812 additions and 0 deletions.
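
In brief, the new support gives a recipe two hooks: set articles_are_obfuscated so every article URL is routed through get_obfuscated_article(), and call extract_readable_article(html, url) there to get back a (cleaned article html, extracted title) pair. A minimal sketch, distilled from the Hacker News recipe below (the class name, placeholder feed, and temp-file suffix are illustrative, not part of this commit):

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile

class ReadableSketch(BasicNewsRecipe):
    title = 'Readability sketch'
    requires_version = (0, 8, 16)   # same version pin as the recipe below
    articles_are_obfuscated = True  # route every article through the hook below
    feeds = [(u'Example feed', 'http://example.com/rss')]  # placeholder
    temp_files = []

    def get_obfuscated_article(self, url):
        raw = self.get_browser().open(url).read()
        # extract_readable_article() returns (cleaned article html, extracted title)
        body, title = self.extract_readable_article(raw, url)
        self.temp_files.append(PersistentTemporaryFile('_sketch.html'))
        self.temp_files[-1].write(u'<html><head><title>%s</title></head><body>%s</body></html>'
                % (title, body))
        self.temp_files[-1].close()
        return self.temp_files[-1].name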
6 changes: 6 additions & 0 deletions COPYRIGHT
@@ -28,6 +28,12 @@ License: other
are permitted in any medium without royalty provided the copyright
notice and this notice are preserved.

Files: src/calibre/ebooks/readability/*
Copyright: Unknown
License: Apache 2.0
The full text of the Apache 2.0 license is available at:
http://www.apache.org/licenses/LICENSE-2.0

Files: /src/cherrypy/*
Copyright: Copyright (c) 2004-2007, CherryPy Team ([email protected])
Copyright: Copyright (C) 2005, Tiago Cogumbreiro <[email protected]>
90 changes: 90 additions & 0 deletions recipes/hackernews.recipe
@@ -0,0 +1,90 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
'''
Hacker News
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse

class HackerNews(BasicNewsRecipe):
    title = 'Hacker News'
    __author__ = 'Tom Scholl'
    description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
    publisher = 'Y Combinator'
    category = 'news, programming, it, technology'
    masthead_url = 'http://i55.tinypic.com/2u6io76.png'
    cover_url = 'http://i55.tinypic.com/2u6io76.png'
    delay = 1
    max_articles_per_feed = 30
    use_embedded_content = False
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'en'
    requires_version = (0, 8, 16)

    feeds = [
        (u'Hacker News', 'http://news.ycombinator.com/rss')
    ]

    temp_files = []
    articles_are_obfuscated = True

    def get_readable_content(self, url):
        self.log('get_readable_content(' + url + ')')
        br = self.get_browser()
        f = br.open(url)
        html = f.read()
        f.close()

        data = self.extract_readable_article(html, url)
        article_html = data[0]
        extracted_title = data[1]
        article_html = u'<cite><strong>' + extracted_title + u'</strong></cite><span> (' + self.prettyify_url(url) + u')</span><br/>' + article_html
        return u'<html><head><title>' + extracted_title + u'</title></head><body>' + article_html + u'</body></html>'

    def get_hn_content(self, url):
        self.log('get_hn_content(' + url + ')')
        # this could be improved
        br = self.get_browser()
        f = br.open(url)
        html = f.read()
        f.close()
        return html

    def get_obfuscated_article(self, url):
        if url.startswith('http://news.ycombinator.com'):
            content = self.get_hn_content(url)
        else:
            # TODO: use content-type header instead of url
            is_image = False
            for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp']:
                if url.endswith(ext):
                    is_image = True
                    break

            if is_image:
                self.log('using image_content (' + url + ')')
                content = u'<html><body><img src="' + url + u'"></body></html>'
            else:
                content = self.get_readable_content(url)

        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(content)
        self.temp_files[-1].close()
        return self.temp_files[-1].name

    def is_link_wanted(self, url, tag):
        if url.endswith('.pdf'):
            return False
        return True

    def prettyify_url(self, url):
        return urlparse(url).hostname

    def populate_article_metadata(self, article, soup, first):
        article.text_summary = self.prettyify_url(article.url)
        article.summary = article.text_summary


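To try the recipe locally, calibre's usual test invocation applies (in test mode only a couple of articles per feed are fetched):

    ebook-convert hackernews.recipe output.epub --test -vv
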
37 changes: 37 additions & 0 deletions src/calibre/ebooks/readability/README.txt
@@ -0,0 +1,37 @@
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0

This is a Python port of a Ruby port of arc90's readability project, taken
from https://github.com/buriy/python-readability

The original readability project:
http://lab.arc90.com/experiments/readability/

In a few words: given an HTML document, it pulls out the main body text and
cleans it up. It can also clean up the title, based on the latest
readability.js code.

Based on:
- Latest readability.js ( https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js )
- Ruby port by starrhorne and iterationlabs
- Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup )
- Decruft effort to move to lxml ( http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/ )
- "BR to P" fix from readability.js which improves quality for smaller texts.
- Github users contributions.

Installation::

easy_install readability-lxml
or
pip install readability-lxml

Usage::

from readability.readability import Document
import urllib
html = urllib.urlopen(url).read()
readable_article = Document(html).summary()
readable_title = Document(html).short_title()

Command-line usage::

python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
1 change: 1 addition & 0 deletions src/calibre/ebooks/readability/__init__.py
@@ -0,0 +1 @@

32 changes: 32 additions & 0 deletions src/calibre/ebooks/readability/cleaners.py
@@ -0,0 +1,32 @@
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re
from lxml.html.clean import Cleaner

bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<" # open
    "([^>]+) " # prefix
    "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes
    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value
    "([^>]*)" # postfix
    ">" # end
    , re.I)

def clean_attributes(html):
    while htmlstrip.search(html):
        html = htmlstrip.sub('<\\1\\2>', html)
    return html

def normalize_spaces(s):
    """Replace any sequence of whitespace characters with a single space."""
    if not s:
        return ''
    return ' '.join(s.split())

html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
                       style=True, links=True, meta=False, add_nofollow=False,
                       page_structure=False, processing_instructions=True, embedded=False,
                       frames=False, forms=False, annoying_tags=False, remove_tags=None,
                       remove_unknown_tags=False, safe_attrs_only=False)
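
As an illustration of clean_attributes: each pass of the while loop removes one matched attribute, and the loop repeats until the pattern no longer matches. A quick sketch with hypothetical inputs:

from calibre.ebooks.readability.cleaners import clean_attributes

# width and bgcolor are stripped over two passes
assert clean_attributes('<td width="200" bgcolor="#fff">x</td>') == '<td>x</td>'
# quoted and unquoted attribute values are both handled
assert clean_attributes('<p style="color: red" height=100>y</p>') == '<p>y</p>'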
25 changes: 25 additions & 0 deletions src/calibre/ebooks/readability/debug.py
@@ -0,0 +1,25 @@
def save_to_file(text, filename):
    f = open(filename, 'wt')
    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    f.write(text.encode('utf-8'))
    f.close()

uids = {}
def describe(node, depth=2):
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
        if node not in uids:
            uid = uids[node] = len(uids) + 1
        else:
            uid = uids.get(node)
        name += "%02d" % (uid)
    if depth and node.getparent() is not None:
        return name + ' - ' + describe(node.getparent(), depth - 1)
    return name
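
For example, describe() names a node CSS-selector style and walks up to `depth` ancestors, which is handy when logging scoring decisions. A sketch with a hypothetical fragment:

from lxml import etree
from calibre.ebooks.readability.debug import describe

root = etree.fromstring('<div id="main"><p class="lead">hi</p></div>')
print describe(root.find('.//p'))  # -> p.lead - #main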
103 changes: 103 additions & 0 deletions src/calibre/ebooks/readability/htmls.py
@@ -0,0 +1,103 @@
import re

from lxml.html import tostring
import lxml.html

from calibre.ebooks.readability.cleaners import normalize_spaces, clean_attributes
from calibre.ebooks.chardet import xml_to_unicode

def build_doc(page):
    page_unicode = xml_to_unicode(page, strip_encoding_pats=True)[0]
    doc = lxml.html.document_fromstring(page_unicode)
    return doc

def js_re(src, pattern, flags, repl):
    # Fixed argument order: sub(replacement, string). The replacement has
    # JavaScript-style $1 group references converted to Python's \1.
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)


def normalize_entities(cur_title):
    entities = {
        u'\u2014': '-',
        u'\u2013': '-',
        u'&mdash;': '-',
        u'&ndash;': '-',
        u'\u00A0': ' ',
        u'\u00AB': '"',
        u'\u00BB': '"',
        u'&quot;': '"',
    }
    for c, r in entities.iteritems():
        if c in cur_title:
            cur_title = cur_title.replace(c, r)

    return cur_title

def norm_title(title):
    return normalize_entities(normalize_spaces(title))

def get_title(doc):
    # Guard against documents with no <title> element, which would
    # otherwise raise an AttributeError on .text
    title = doc.find('.//title')
    if title is None or not title.text:
        return '[no-title]'

    return norm_title(title.text)

def add_match(collection, text, orig):
    text = norm_title(text)
    if len(text.split()) >= 2 and len(text) >= 15:
        if text.replace('"', '') in orig.replace('"', ''):
            collection.add(text)

def shorten_title(doc):
    # Same guard as get_title(): a missing <title> must not crash
    title = doc.find('.//title')
    if title is None or not title.text:
        return ''

    title = orig = norm_title(title.text)

    candidates = set()

    for item in ['.//h1', './/h2', './/h3']:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
        for e in doc.cssselect(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            # for/else: no delimiter produced a long enough part
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    if not 15 < len(title) < 150:
        return orig

    return title

def get_body(doc):
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
    raw_html = unicode(tostring(doc.body or doc))
    return clean_attributes(raw_html)
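
As a rough illustration of the title heuristics: a heading that matches the start of the <title> beats the raw <title> with its site-name clutter. A sketch with a hypothetical page:

import lxml.html
from calibre.ebooks.readability.htmls import get_title, shorten_title

doc = lxml.html.document_fromstring(
    '<html><head><title>Vacuum tubes in audio - Example News</title></head>'
    '<body><h1>Vacuum tubes in audio</h1></body></html>')
print get_title(doc)      # -> Vacuum tubes in audio - Example News
print shorten_title(doc)  # -> Vacuum tubes in audio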

[Diffs for the remaining 2 of 9 changed files not loaded]