forked from kovidgoyal/calibre
Add readability to calibre and Hacker News by Tom Scholl. Fixes #833261 (Add readability lib for use in recipes)
Showing 9 changed files with 812 additions and 0 deletions.
@@ -28,6 +28,12 @@ License: other
 are permitted in any medium without royalty provided the copyright
 notice and this notice are preserved.

Files: src/calibre/ebooks/readability/*
Copyright: Unknown
License: Apache 2.0
 The full text of the Apache 2.0 license is available at:
 http://www.apache.org/licenses/LICENSE-2.0

Files: /src/cherrypy/*
Copyright: Copyright (c) 2004-2007, CherryPy Team ([email protected])
Copyright: Copyright (C) 2005, Tiago Cogumbreiro <[email protected]>
@@ -0,0 +1,90 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
'''
Hacker News
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse

class HackerNews(BasicNewsRecipe):
    title = 'Hacker News'
    __author__ = 'Tom Scholl'
    description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
    publisher = 'Y Combinator'
    category = 'news, programming, it, technology'
    masthead_url = 'http://i55.tinypic.com/2u6io76.png'
    cover_url = 'http://i55.tinypic.com/2u6io76.png'
    delay = 1
    max_articles_per_feed = 30
    use_embedded_content = False
    no_stylesheets = True
    encoding = 'utf-8'
    language = 'en'
    requires_version = (0, 8, 16)

    feeds = [
        (u'Hacker News', 'http://news.ycombinator.com/rss')
    ]

    temp_files = []
    # Feed entries point at external sites, so article bodies are fetched
    # through get_obfuscated_article() below instead of the feed URLs.
    articles_are_obfuscated = True

    def get_readable_content(self, url):
        self.log('get_readable_content(' + url + ')')
        br = self.get_browser()
        f = br.open(url)
        html = f.read()
        f.close()

        data = self.extract_readable_article(html, url)
        article_html = data[0]
        extracted_title = data[1]
        article_html = u'<cite><strong>' + extracted_title + u'</strong></cite><span> (' + self.prettyify_url(url) + u')</span><br/>' + article_html
        return u'<html><head><title>' + extracted_title + u'</title></head><body>' + article_html + u'</body></html>'

    def get_hn_content(self, url):
        self.log('get_hn_content(' + url + ')')
        # this could be improved
        br = self.get_browser()
        f = br.open(url)
        html = f.read()
        f.close()
        return html

    def get_obfuscated_article(self, url):
        if url.startswith('http://news.ycombinator.com'):
            content = self.get_hn_content(url)
        else:
            # TODO: use content-type header instead of url
            is_image = False
            for ext in ['.jpg', '.png', '.svg', '.gif', '.jpeg', '.tiff', '.bmp']:
                if url.endswith(ext):
                    is_image = True
                    break

            if is_image:
                self.log('using image_content (' + url + ')')
                content = u'<html><body><img src="' + url + u'"></body></html>'
            else:
                content = self.get_readable_content(url)

        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(content)
        self.temp_files[-1].close()
        return self.temp_files[-1].name

    def is_link_wanted(self, url, tag):
        if url.endswith('.pdf'):
            return False
        return True

    def prettyify_url(self, url):
        return urlparse(url).hostname

    def populate_article_metadata(self, article, soup, first):
        article.text_summary = self.prettyify_url(article.url)
        article.summary = article.text_summary
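For context on the hook above: with articles_are_obfuscated set, calibre calls get_obfuscated_article() for each article and reads the returned temporary file instead of the original URL. As a rough standalone sketch of the same fetch/extract/wrap flow, written against the readability-lxml package documented in the README below rather than calibre's wrapper (the readable_page helper is illustrative, not part of this commit):

# Sketch only: approximates get_readable_content() without calibre,
# using readability-lxml's Document (see the README below).
import urllib
from urlparse import urlparse
from readability.readability import Document

def readable_page(url):
    html = urllib.urlopen(url).read()
    doc = Document(html)
    title = doc.short_title()   # cleaned-up title
    body = doc.summary()        # main body text as HTML
    # same framing the recipe builds: a cite line with the source host,
    # then the extracted article
    return (u'<html><head><title>%s</title></head><body>'
            u'<cite><strong>%s</strong></cite><span> (%s)</span><br/>%s'
            u'</body></html>') % (title, title, urlparse(url).hostname, body)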
@@ -0,0 +1,37 @@
This code is under the Apache License 2.0. http://www.apache.org/licenses/LICENSE-2.0

This is a Python port of a Ruby port of arc90's readability project, taken
from https://github.com/buriy/python-readability

The original readability project:
http://lab.arc90.com/experiments/readability/

In a few words: given an HTML document, it pulls out the main body text and
cleans it up. It can also clean up the title, based on the latest
readability.js code.

Based on:
- Latest readability.js ( https://github.com/MHordecki/readability-redux/blob/master/readability/readability.js )
- Ruby port by starrhorne and iterationlabs
- Python port by gfxmonk ( https://github.com/gfxmonk/python-readability , based on BeautifulSoup )
- Decruft effort to move to lxml ( http://www.minvolai.com/blog/decruft-arc90s-readability-in-python/ )
- "BR to P" fix from readability.js, which improves quality for smaller texts
- GitHub users' contributions

Installation::

    easy_install readability-lxml
    or
    pip install readability-lxml

Usage::

    from readability.readability import Document
    import urllib
    html = urllib.urlopen(url).read()
    readable_article = Document(html).summary()
    readable_title = Document(html).short_title()

Command-line usage::

    python -m readability.readability -u http://pypi.python.org/pypi/readability-lxml
@@ -0,0 +1 @@

@@ -0,0 +1,32 @@
# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds
import re
from lxml.html.clean import Cleaner

bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
htmlstrip = re.compile("<"  # open
    "([^>]+) "  # prefix
    "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
    "([^>]*)"  # postfix
    ">",  # end
    re.I)

def clean_attributes(html):
    # each substitution removes one offending attribute per tag,
    # so loop until none remain
    while htmlstrip.search(html):
        html = htmlstrip.sub('<\\1\\2>', html)
    return html

def normalize_spaces(s):
    """replace any sequence of whitespace
    characters with a single space"""
    if not s:
        return ''
    return ' '.join(s.split())

html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
    style=True, links=True, meta=False, add_nofollow=False,
    page_structure=False, processing_instructions=True, embedded=False,
    frames=False, forms=False, annoying_tags=False, remove_tags=None,
    remove_unknown_tags=False, safe_attrs_only=False)
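A quick, hedged illustration of what these helpers (imported elsewhere in this commit as calibre.ebooks.readability.cleaners) do on toy input; the exact serialization from Cleaner.clean_html may vary slightly across lxml versions:

# Python 2 session demonstrating the cleaners above.
from calibre.ebooks.readability.cleaners import (clean_attributes,
        normalize_spaces, html_cleaner)

# the while loop strips one nuisance attribute per pass
print clean_attributes('<td width="100" style="color: red">text</td>')
# -> <td>text</td>

print normalize_spaces(u'one \n\t two')
# -> one two

# html_cleaner drops scripts and javascript attributes, keeps page structure
print html_cleaner.clean_html('<p onclick="alert(1)">hi<script>x()</script></p>')
# -> <p>hi</p> (approximately)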
@@ -0,0 +1,25 @@
def save_to_file(text, filename):
    f = open(filename, 'wt')
    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    f.write(text.encode('utf-8'))
    f.close()

# stable per-node ids, so repeated describe() calls label anonymous
# tr/td/div/p nodes consistently
uids = {}
def describe(node, depth=2):
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)
    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', ''):
        name += '.' + node.get('class').replace(' ', '.')
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
        if node not in uids:
            uid = uids[node] = len(uids) + 1
        else:
            uid = uids.get(node)
        name += "%02d" % (uid)
    if depth and node.getparent() is not None:
        return name + ' - ' + describe(node.getparent(), depth - 1)
    return name
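As a small, hedged example of the labels describe() produces (toy document; the expected output is traced by hand from the code above):

# describe() names a node by tag/id/class and walks up to two ancestors.
import lxml.html
root = lxml.html.document_fromstring(
    '<html><body><div id="main" class="post body"><p>hi</p></div></body></html>')
p = root.find('.//p')
print describe(p)
# -> p01 - #main.post.body - body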
@@ -0,0 +1,103 @@
import re

from lxml.html import tostring
import lxml.html

from calibre.ebooks.readability.cleaners import normalize_spaces, clean_attributes
from calibre.ebooks.chardet import xml_to_unicode

def build_doc(page):
    page_unicode = xml_to_unicode(page, strip_encoding_pats=True)[0]
    doc = lxml.html.document_fromstring(page_unicode)
    return doc

def js_re(src, pattern, flags, repl):
    # emulate JavaScript String.replace(): '$1' back-references become '\1'
    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)

def normalize_entities(cur_title):
    entities = {
        u'\u2014': '-',
        u'\u2013': '-',
        u'&mdash;': '-',
        u'&ndash;': '-',
        u'\u00A0': ' ',
        u'\u00AB': '"',
        u'\u00BB': '"',
        u'&quot;': '"',
    }
    for c, r in entities.iteritems():
        if c in cur_title:
            cur_title = cur_title.replace(c, r)

    return cur_title

def norm_title(title):
    return normalize_entities(normalize_spaces(title))

def get_title(doc):
    title = doc.find('.//title')
    if title is None or not title.text:
        return '[no-title]'

    return norm_title(title.text)

def add_match(collection, text, orig):
    text = norm_title(text)
    if len(text.split()) >= 2 and len(text) >= 15:
        if text.replace('"', '') in orig.replace('"', ''):
            collection.add(text)

def shorten_title(doc):
    title = doc.find('.//title')
    if title is None or not title.text:
        return ''

    title = orig = norm_title(title.text)

    candidates = set()

    for item in ['.//h1', './/h2', './/h3']:
        for e in list(doc.iterfind(item)):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title',
            '.title', '.head', '.heading', '.contentheading',
            '.small_header_red']:
        for e in doc.cssselect(item):
            if e.text:
                add_match(candidates, e.text, orig)
            if e.text_content():
                add_match(candidates, e.text_content(), orig)

    if candidates:
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [' | ', ' - ', ' :: ', ' / ']:
            if delimiter in title:
                parts = orig.split(delimiter)
                if len(parts[0].split()) >= 4:
                    title = parts[0]
                    break
                elif len(parts[-1].split()) >= 4:
                    title = parts[-1]
                    break
        else:
            if ': ' in title:
                parts = orig.split(': ')
                if len(parts[-1].split()) >= 4:
                    title = parts[-1]
                else:
                    title = orig.split(': ', 1)[1]

    if not 15 < len(title) < 150:
        return orig

    return title

def get_body(doc):
    for elem in doc.xpath('.//script | .//link | .//style'):
        elem.drop_tree()
    raw_html = unicode(tostring(doc.body or doc))
    return clean_attributes(raw_html)
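To make the title heuristics concrete, a small hedged example (plain lxml, no calibre imports needed; expected output traced by hand from the code above):

# shorten_title() prefers an on-page heading that is a substring of <title>
import lxml.html
doc = lxml.html.document_fromstring(
    '<html><head><title>A Fairly Long Article Title | Example Site</title></head>'
    '<body><h1>A Fairly Long Article Title</h1></body></html>')
print get_title(doc)
# -> A Fairly Long Article Title | Example Site
print shorten_title(doc)
# -> A Fairly Long Article Title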