From b950d14b0cb051547c7da6f09046ddeaaf1300ef Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 25 Aug 2011 07:03:15 -0600 Subject: [PATCH] ... --- recipes/fairbanks_daily.recipe | 93 ++++++++++++++-------------------- 1 file changed, 39 insertions(+), 54 deletions(-) diff --git a/recipes/fairbanks_daily.recipe b/recipes/fairbanks_daily.recipe index 282925728e2a..fcf6873904f4 100644 --- a/recipes/fairbanks_daily.recipe +++ b/recipes/fairbanks_daily.recipe @@ -1,5 +1,3 @@ -#import re # Provides preprocess_regexps re.compile - from calibre.web.feeds.news import BasicNewsRecipe class FairbanksDailyNewsminer(BasicNewsRecipe): @@ -8,21 +6,28 @@ class FairbanksDailyNewsminer(BasicNewsRecipe): oldest_article = 7 max_articles_per_feed = 100 - description = ''''The voice of interior Alaska since 1903''' + description = 'The voice of interior Alaska since 1903' publisher = 'http://www.newsminer.com/' category = 'news, Alaska, Fairbanks' language = 'en' - #extra_css = ''' - # p{font-weight: normal;text-align: justify} - # ''' - + + # Make article titles, author and date bold, italic or small font. + # http://assets.matchbin.com/sites/635/stylesheets/newsminer.com.css + # (signature_line contains date, views, comments) + extra_css = ''' + .story_item_headline { font-size: medium; font-weight: bold; } + .story_item_author { font-size: small; font-style:italic; } + .signature_line { font-size: small; } + ''' + remove_javascript = True use_embedded_content = False no_stylesheets = True language = 'en' encoding = 'utf8' conversion_options = {'linearize_tables':True} - # TODO: I don't see any photos in my Mobi file with this masterhead_url! + + # TODO: The News-miner cover image seems a bit small. Can this be enlarged by 10-30%? masthead_url = 'http://d2uh5w9wm14i0w.cloudfront.net/sites/635/assets/top_masthead_-_menu_pic.jpg' @@ -31,6 +36,10 @@ class FairbanksDailyNewsminer(BasicNewsRecipe): # manual processing is needed to get just the "story_item_date updated" # (which contains the date). Everything else on this line is pretty much not needed. # + # Currently, you will see the following: + # | Aug 24, 2011 | 654 views | 6 | | + # (ie. 6 comments) + # # HTML line containing story_item_date: #
Aug 22, 2011 | 2370 views | 52 52 comments | 9 9 recommendations | email to a friend | print
@@ -40,73 +49,49 @@ class FairbanksDailyNewsminer(BasicNewsRecipe): #preprocess_regexps = [(re.compile(r']*addthis_separator*>'), lambda match: '') ] #preprocess_regexps = [(re.compile(r'span class="addthis_separator">|'), lambda match: '') ] - + #preprocess_regexps = [ # (re.compile(r'.*?', re.IGNORECASE | re.DOTALL), lambda match : ''), # ] - + #def get_browser(self): #def preprocess_html(soup, first_fetch): # date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'})) # return + #preprocess_regexps = [(re.compile(r' |.*?', re.DOTALL), lambda m: '')] + - # Try to keep some tags - some might not be needed here keep_only_tags = [ - #date = self.tag_to_string(soup.find('span', attrs={'class':'story_item_date updated'})), - dict(name='div', attrs={'class':'hnews hentry item'}), + #dict(name='div', attrs={'class':'hnews hentry item'}), dict(name='div', attrs={'class':'story_item_headline entry-title'}), + #dict(name='div', attrs={'class':'story_item_author'}), #dict(name='span', attrs={'class':'story_item_date updated'}), + #dict(name='div', attrs={'class':'story_item_author'}), dict(name='div', attrs={'class':'full_story'}) ] - #remove_tags = [ - # dict(name='div', attrs={'class':'story_tools'}), - # dict(name='p', attrs={'class':'ad_label'}), - # ] - # Try to remove some bothersome tags remove_tags = [ + # Try getting rid of some signature_line (date line) stuff #dict(name='img', attrs={'alt'}), dict(name='img', attrs={'class':'dont_touch_me'}), dict(name='span', attrs={'class':'number_recommendations'}), #dict(name='div', attrs={'class':'signature_line'}), + + # Removes div within dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'}), - dict(name='div', attrs={'class':['addthis_toolbox','addthis_default_style']}), - dict(name='span', attrs={'class':'addthis_separator'}), + dict(name='div', attrs={'class':'related_content'}), - dict(name='div', attrs={'class':'comments_container'}), - #dict(name='div', attrs={'class':'signature_line'}), - dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'}), dict(name='div', attrs={'id':'comments_container'}) ] - # This one works but only gets title, date and clips article content! - #remove_tags_after = [ - # dict(name='span', attrs={'class':'story_item_date updated'}) - # ] - - #remove_tags_after = [ - # dict(name='div', attrs={'class':'advertisement'}), - # ] - - # Try clipping tags before and after to prevent pulling img views/posts numbers after date? - #remove_tags_before = [ - # dict(name='span', attrs={'class':'story_item_date updated'}) - # ] - - #extra_css # tweak the appearance # TODO: Change article titles to bold? - - # Comment-out or uncomment any of the following RSS feeds according to your # liking. # - # TODO: Adding more then one RSS Feed, and newline will be omitted for - # entries within the Table of Contents or Index of Articles - # - # TODO: Some random bits of text is trailing the last page (or TOC on MOBI - # files), these are bits of public posts and comments and need to also be - # removed. + # TODO: Some random bits of text might be trailing the last page (or TOC on + # MOBI files), these are bits of public posts and comments and need to also + # be removed. # feeds = [ (u'Alaska News', u'http://newsminer.com/rss/rss_feeds/alaska_news?content_type=article&tags=alaska_news&page_name=rss_feeds&instance=alaska_news'), @@ -114,15 +99,15 @@ class FairbanksDailyNewsminer(BasicNewsRecipe): (u'Business', u'http://newsminer.com/rss/rss_feeds/business_news?content_type=article&tags=business_news&page_name=rss_feeds&instance=business_news'), (u'Politics', u'http://newsminer.com/rss/rss_feeds/politics_news?content_type=article&tags=politics_news&page_name=rss_feeds&instance=politics_news'), (u'Sports', u'http://newsminer.com/rss/rss_feeds/sports_news?content_type=article&tags=sports_news&page_name=rss_feeds&instance=sports_news'), - # (u'Latitude 65 feed', u'http://newsminer.com/rss/rss_feeds/latitude_65?content_type=article&tags=latitude_65&page_name=rss_feeds&offset=0&instance=latitude_65'), + (u'Latitude 65 feed', u'http://newsminer.com/rss/rss_feeds/latitude_65?content_type=article&tags=latitude_65&page_name=rss_feeds&offset=0&instance=latitude_65'), (u'Sundays', u'http://newsminer.com/rss/rss_feeds/Sundays?content_type=article&tags=alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Sundays'), - # (u'Outdoors', u'http://newsminer.com/rss/rss_feeds/Outdoors?content_type=article&tags=outdoors&page_name=rss_feeds&instance=Outdoors'), - # (u'Fairbanks Grizzlies', u'http://newsminer.com/rss/rss_feeds/fairbanks_grizzlies?content_type=article&tags=fairbanks_grizzlies&page_name=rss_feeds&instance=fairbanks_grizzlies'), + (u'Outdoors', u'http://newsminer.com/rss/rss_feeds/Outdoors?content_type=article&tags=outdoors&page_name=rss_feeds&instance=Outdoors'), + #(u'Fairbanks Grizzlies', u'http://newsminer.com/rss/rss_feeds/fairbanks_grizzlies?content_type=article&tags=fairbanks_grizzlies&page_name=rss_feeds&instance=fairbanks_grizzlies'), (u'Newsminer', u'http://newsminer.com/rss/rss_feeds/Newsminer?content_type=article&tags=ted_stevens_bullets+ted_stevens+sports_news+business_news+fairbanks_grizzlies+dermot_cole_column+outdoors+alaska_science_forum+scott_mccrea+interior_gardening+in_the_bush+judy_ferguson+book_reviews+theresa_bakker+judith_kleinfeld+interior_scrapbook+nuggets_comics+freeze_frame&page_name=rss_feeds&tag_inclusion=or&instance=Newsminer'), - # (u'Opinion', u'http://newsminer.com/rss/rss_feeds/Opinion?content_type=article&tags=editorials&page_name=rss_feeds&instance=Opinion'), - # (u'Youth', u'http://newsminer.com/rss/rss_feeds/Youth?content_type=article&tags=youth&page_name=rss_feeds&instance=Youth'), - # (u'Dermot Cole Blog', u'http://newsminer.com/rss/rss_feeds/dermot_cole_blog+rss?content_type=blog+entry&sort_by=posted_on&user_ids=3015275&page_name=blogs_dermot_cole&limit=10&instance=dermot_cole_blog+rss'), - # (u'Dermot Cole Column', u'http://newsminer.com/rss/rss_feeds/Dermot_Cole_column?content_type=article&tags=dermot_cole_column&page_name=rss_feeds&instance=Dermot_Cole_column'), - (u'Sarah Palin', u'http://newsminer.com/rss/rss_feeds/sarah_palin?content_type=article&tags=palin_in_the_news+palin_on_the_issues&page_name=rss_feeds&tag_inclusion=or&instance=sarah_palin') + (u'Opinion', u'http://newsminer.com/rss/rss_feeds/Opinion?content_type=article&tags=editorials&page_name=rss_feeds&instance=Opinion'), + (u'Youth', u'http://newsminer.com/rss/rss_feeds/Youth?content_type=article&tags=youth&page_name=rss_feeds&instance=Youth'), + #(u'Dermot Cole Blog', u'http://newsminer.com/rss/rss_feeds/dermot_cole_blog+rss?content_type=blog+entry&sort_by=posted_on&user_ids=3015275&page_name=blogs_dermot_cole&limit=10&instance=dermot_cole_blog+rss'), + (u'Dermot Cole Column', u'http://newsminer.com/rss/rss_feeds/Dermot_Cole_column?content_type=article&tags=dermot_cole_column&page_name=rss_feeds&instance=Dermot_Cole_column'), + #(u'Sarah Palin', u'http://newsminer.com/rss/rss_feeds/sarah_palin?content_type=article&tags=palin_in_the_news+palin_on_the_issues&page_name=rss_feeds&tag_inclusion=or&instance=sarah_palin') ]