diff --git a/Changelog.txt b/Changelog.txt index ba0dbbf31389..351c8a5e6a0a 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -23,6 +23,36 @@ # - title by author # }}} +{{{ 7.9.0 2024-04-19 + +:: new features + +- [2060886] Kobo driver: Add support for the new color Kobo devices + +- Edit book: Add a setting to control cursor width under Preferences->Editor settings + +- Edit book: Regex-function mode: Show a confirmation popup when closing the function editor when there are unsaved changes + +:: bug fixes + +- [2060314] Fix undocked Quickview dialog not being restored at startup + +- [2044118] Windows: Fix an issue where closing a maximized calibre window to the system tray and then reconnecting with remote desktop would cause a blank calibre window to be displayed + +:: improved recipes +- El Correo +- Eenadu +- ORFonline +- NatGeo +- Harpers Magazine +- New Yorker +- Business Today +- The Week +- Asahi Shimbun +- Outlook Magazine + +}}} + {{{ 7.8.0 2024-04-05 :: new features diff --git a/bypy/windows/XUnzip.cpp b/bypy/windows/XUnzip.cpp index 40de3b2cec22..29a7f235d0c8 100644 --- a/bypy/windows/XUnzip.cpp +++ b/bypy/windows/XUnzip.cpp @@ -3072,7 +3072,7 @@ unzFile unzOpenInternal(LUFILE *fin) uLong number_disk; // number of the current dist, used for spanning ZIP, unsupported, always 0 if (unzlocal_getShort(fin,&number_disk)!=UNZ_OK) err=UNZ_ERRNO; // number of the disk with the start of the central directory - uLong number_disk_with_CD; // number the the disk with central dir, used for spanning ZIP, unsupported, always 0 + uLong number_disk_with_CD; // number of the disk with central dir, used for spanning ZIP, unsupported, always 0 if (unzlocal_getShort(fin,&number_disk_with_CD)!=UNZ_OK) err=UNZ_ERRNO; // total number of entries in the central dir on this disk if (unzlocal_getShort(fin,&us.gi.number_entry)!=UNZ_OK) err=UNZ_ERRNO; diff --git a/bypy/windows/wix.py b/bypy/windows/wix.py index 5c93e21c4bfe..6eb2f88018d1 100644 --- a/bypy/windows/wix.py +++ b/bypy/windows/wix.py @@ -19,6 +19,11 @@ j, d, a, b = os.path.join, os.path.dirname, os.path.abspath, os.path.basename +def add_wix_extension(name): + if not os.path.exists(os.path.expanduser(f'~/.wix/extensions/{name}')): + run(WIX, 'extension', 'add', '-g', name) + + def create_installer(env, compression_level='9'): cl = int(compression_level) if cl > 4: @@ -62,8 +67,8 @@ def create_installer(env, compression_level='9'): arch = 'x64' if is64bit else 'x86' installer = j(env.dist, '%s%s-%s.msi' % ( calibre_constants['appname'], ('-64bit' if is64bit else ''), calibre_constants['version'])) - run(WIX, 'extension', 'add', '-g', 'WixToolset.Util.wixext') - run(WIX, 'extension', 'add', '-g', 'WixToolset.UI.wixext') + add_wix_extension('WixToolset.Util.wixext') + add_wix_extension('WixToolset.UI.wixext') cmd = [WIX, 'build', '-arch', arch, '-culture', 'en-us', '-loc', enusf, '-dcl', dcl, '-ext', 'WixToolset.Util.wixext', '-ext', 'WixToolset.UI.wixext', '-o', installer, wxsf] run(*cmd) diff --git a/format_docs/pdb/pdb_format.txt b/format_docs/pdb/pdb_format.txt index 77fcba87db37..9d8851b7e873 100644 --- a/format_docs/pdb/pdb_format.txt +++ b/format_docs/pdb/pdb_format.txt @@ -3,7 +3,7 @@ Format A PDB file can be broken into multiple parts. The header, record 0 and data. values stored within the various parts are big-endian byte order. The data -part is is broken down into multiple sections. The section count and offsets +part is broken down into multiple sections. The section count and offsets are referenced in the PDB header.
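The PDB layout described above is straightforward to read in code. A minimal sketch, assuming the standard 78-byte Palm DB header, with a 2-byte big-endian section count at offset 76 followed by 8-byte entries holding a 4-byte offset, 1 attribute byte and a 3-byte unique id; the function name is illustrative, not part of calibre:

import struct

def read_pdb_section_offsets(path):
    # All multi-byte values in a PDB file are big-endian, hence the '>' formats.
    with open(path, 'rb') as f:
        header = f.read(78)                      # fixed header plus section count
        count, = struct.unpack('>H', header[76:78])
        offsets = []
        for _ in range(count):
            entry = f.read(8)                    # offset(4) + attributes(1) + unique id(3)
            offsets.append(struct.unpack('>I', entry[:4])[0])
    return offsets

Consecutive offsets delimit the sections referred to in the next paragraph.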
Sections can be no more than 65505 bytes in length. diff --git a/recipes/asahi_shimbun_en.recipe b/recipes/asahi_shimbun_en.recipe index d559901de7c2..7cdbfbbc8b47 100644 --- a/recipes/asahi_shimbun_en.recipe +++ b/recipes/asahi_shimbun_en.recipe @@ -153,7 +153,7 @@ class AsahiShimbunEnglishNews(BasicNewsRecipe): ("Asia & World - World", self.get_section("asia_world/world")), ("Sci & Tech", self.get_section("sci_tech")), ("Culture - Style", self.get_section("culture/style")), - ("Culture - Cooking", self.get_section("culture/cooking")), + # ("Culture - Cooking", self.get_section("culture/cooking")), ("Culture - Movies", self.get_section("culture/movies")), ("Culture - Manga & Anime", self.get_section("culture/manga_anime")), ("Travel", self.get_section("travel")), diff --git a/recipes/barrons.recipe b/recipes/barrons.recipe index 0a5d08e7f564..fd833aa2f5bb 100644 --- a/recipes/barrons.recipe +++ b/recipes/barrons.recipe @@ -77,10 +77,14 @@ class barrons(BasicNewsRecipe): return br def parse_index(self): + self.log( + '\n***\nif this recipe fails, report it on: ' + 'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n' + ) archive = self.index_to_soup('https://www.barrons.com/magazine?archives=' + date.today().strftime('%Y')) issue = archive.find(**prefixed_classes('BarronsTheme--archive-box--')) self.timefmt = ' [' + self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--date--'))) + ']' - self.log(self.timefmt) + self.description = self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--headline--'))) self.cover_url = issue.img['src'].split('?')[0] ans = defaultdict(list) diff --git a/recipes/business_today.recipe b/recipes/business_today.recipe index 44c5705ba379..1ea2f2b3495c 100644 --- a/recipes/business_today.recipe +++ b/recipes/business_today.recipe @@ -74,7 +74,7 @@ class BT(BasicNewsRecipe): # Insert feeds in specified order, if available - feedSort = ['Editor\'s Note'] + feedSort = ['Editor\'s Note', 'Editors note'] for i in feedSort: if i in sections: feeds.append((i, sections[i])) @@ -82,7 +82,8 @@ class BT(BasicNewsRecipe): # Done with the sorted feeds for i in feedSort: - del sections[i] + if i in sections: + del sections[i] # Append what is left over... 
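The business_today.recipe hunk above does two things: it accepts an alternate spelling of the Editor's Note section, and it guards the deletion loop so a missing title no longer raises KeyError. A sketch of the pattern with hypothetical section data (the titles and article lists here are placeholders, not from the recipe):

# Hypothetical data; in the recipe, sections maps feed titles to article lists.
sections = {"Editor's Note": ['a1'], 'Markets': ['a2'], 'Tech': ['a3']}
feedSort = ["Editor's Note", 'Editors note']   # both spellings the site has used

# Preferred feeds first, when present...
feeds = [(name, sections[name]) for name in feedSort if name in sections]
for name in feedSort:
    if name in sections:    # the added guard: the key may be absent
        del sections[name]
# ...then append what is left over.
feeds.extend(sections.items())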
diff --git a/recipes/caravan_magazine.recipe b/recipes/caravan_magazine.recipe index 7896deda7261..c271c3a92c14 100644 --- a/recipes/caravan_magazine.recipe +++ b/recipes/caravan_magazine.recipe @@ -92,8 +92,8 @@ class CaravanMagazine(BasicNewsRecipe): 'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n' ) api = 'https://api.caravanmagazine.in/api/trpc/magazines.getLatestIssue' - # https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&input= - # %7B%220%22%3A%7B%22json%22%3A%7B%22month%22%3A 2 %2C%22year%22%3A 2024 %7D%7D%7D + # api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&' + \ + # 'input=%7B%220%22%3A%7B%22json%22%3A%7B%22month%22%3A' + '2' + '%2C%22year%22%3A' + '2024' + '%7D%7D%7D' # input={"0":{"json":{"month":2,"year":2024}}} raw = self.index_to_soup(api, raw=True) data = json.loads(raw)['result']['data']['json'] diff --git a/recipes/eenadu.recipe b/recipes/eenadu.recipe index cebec9e5a2c8..893dec36e064 100644 --- a/recipes/eenadu.recipe +++ b/recipes/eenadu.recipe @@ -1,7 +1,5 @@ -import re -from datetime import date, datetime, timedelta +from urllib.parse import quote -from calibre.utils.date import parse_date from calibre.web.feeds.news import BasicNewsRecipe, classes @@ -10,118 +8,116 @@ class eenadu_ts(BasicNewsRecipe): __author__ = 'unkn0wn' description = 'THE LARGEST CIRCULATED TELUGU DAILY' language = 'te' - use_embedded_content = False - remove_javascript = True + encoding = 'utf-8' no_stylesheets = True - remove_attributes = ['height', 'width', 'style'] - ignore_duplicate_articles = {'url', 'title'} + remove_javascript = True masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png' - cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/08/GTH/5_01/d5041804_01_mr.jpg' - encoding = 'utf-8' + remove_attributes = ['style', 'height', 'width'] + ignore_duplicate_articles = {'url', 'title'} + reverse_article_order = True remove_empty_feeds = True - extra_css = '.pub-t{font-size:small; font-style:italic;}' + simultaneous_downloads = 1 + art_url = '' - keep_only_tags = [ - dict(name='h1'), - dict(**classes('pub-t')), - classes('fullstory text-justify contlist-cont'), - dict(name='span', attrs={'id': 'PDSAIApbreak'}), - ] + extra_css = ''' + img {display:block; margin:0 auto;} + blockquote, em {color:#202020;} + .pub-t{font-size:small; font-style:italic;} + ''' - remove_tags = [ - dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}), - dict( - name='p', - attrs={ - 'style': - 'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;' - } - ), - dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}), - dict(name='br'), - classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list') - ] + keep_only_tags = [classes('bookWrapper fullstory')] + remove_tags = [classes('ext-link offset-tb1 sshare-c')] + + articles_are_obfuscated = True + + def get_obfuscated_article(self, url): + br = self.get_browser() + soup = self.index_to_soup(url) + link = soup.a['href'] + skip_sections =[ # add sections you want to skip + '/video/', '/videos/', '/multimedia/', 'marathi', 'hindi', 'bangla' + ] + if any(x in link for x in skip_sections): + self.abort_article('skipping video links') + self.log('Found ', link) + self.art_url = link + html = br.open(link).read() + return ({ 'data': html, 'url': link }) - def parse_index(self): - section_list = [ - ('తెలంగాణ తాజా వార్తలు', 'telangana'), - ('సంపాదకీయం', 'telangana/editorial'), - 
('తెలంగాణ ప్రధానాంశాలు', 'telangana/top-news'), - ('తెలంగాణ జిల్లా వార్తలు', 'telangana/districts'), - # ('క్రైమ్', 'crime'), + resolve_internal_links = True + remove_empty_feeds = True + + def get_cover_url(self): + import json + from datetime import date + today = quote(date.today().strftime('%d/%m/%Y'), safe='') + raw = self.index_to_soup( + 'https://epaper.eenadu.net/Home/GetAllpages?editionid=1&editiondate=' + today, raw=True + ) + for cov in json.loads(raw): + if cov['NewsProPageTitle'].lower().startswith('front'): + return cov['HighResolution'] + + feeds = [] + + when = '27' # hours + index = 'https://www.eenadu.net' + a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=te-IN&gl=IN&ceid=IN:te' + + news = index + '/telugu-news/' + news_list = [ + ('తెలంగాణ ప్రధానాంశాలు', 'ts-top-news'), + ('సంపాదకీయం', 'editorial'), + ('వ్యాఖ్యానం', 'vyakyanam'), + ('హైదరాబాద్ జిల్లా వార్తలు', 'districts/Hyderabad'), + ('క్రైమ్', 'crime'), ('పాలిటిక్స్', 'politics'), ('జాతీయం', 'india'), ('బిజినెస్', 'business'), ('అంతర్జాతీయం', 'world'), ('క్రీడలు', 'sports'), - # ('సినిమా', 'movies'), - # ('చదువు', 'education'), - # ('సుఖీభవ', 'health'), - # ('ఈ-నాడు', 'technology'), - # ('మకరందం', 'devotional'), - # ('ఈ తరం', 'youth'), - # ('ఆహా', 'recipes'), - # ('హాయ్ బుజ్జీ', 'kids-stories'), - # ('స్థిరాస్తి', 'real-estate'), + ('సినిమా', 'movies'), + ('వసుంధర', 'women'), + ('ఈ-నాడు', 'technology'), + ('వెబ్ ప్రత్యేకం', 'explained') ] - is_sunday = date.today().weekday() == 6 - if is_sunday: - section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine')) - feeds = [] - - # For each section title, fetch the article urls - for section in section_list: - section_title = section[0] - section_url = 'https://www.eenadu.net/' + section[1] - self.log(section_title, section_url) - soup = self.index_to_soup(section_url) - articles = self.articles_from_soup(soup) - if articles: - feeds.append((section_title, articles)) - return feeds - - def articles_from_soup(self, soup): - ans = [] - for link in soup.findAll( - attrs={ - 'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel'] - } - ): - for a in link.findAll('a', attrs={'href': True}): - url = a['href'] - if url.startswith('https') is False: - url = 'https://www.eenadu.net/' + url - - try: - desc = self.tag_to_string(a.find('div')).strip() - except Exception: - desc = '' - - for h3 in a.findAll('h3'): - title = self.tag_to_string(h3).strip() - sub = re.escape(title) - desc = re.sub(sub, '', desc).strip() - - if not title or not url: - continue - - self.log('\t', title, '\n\t', desc, '\n\t\t', url) - ans.append({'title': title, 'url': url, 'description': desc}) - return ans - - def preprocess_html(self, soup): - div = soup.find('div', **classes('pub-t')) - if div: - date = parse_date( - self.tag_to_string(div).strip().replace('Published : ', '').replace( - 'Updated : ', '' - ).replace(' IST', ':00.000001') - ).replace(tzinfo=None) - today = datetime.now() - if (today - date) > timedelta(1.15): - self.abort_article('Skipping old article') - else: - self.abort_article('not an article') - for img in soup.findAll('img', attrs={'data-src': True}): - img['src'] = img['data-src'] - return soup + for n in news_list: + news_index = news + n[1] + '/' + feeds.append((n[0], a.format(when, quote(news_index, safe='')))) + feeds.append(('Other News', a.format(when, quote(news, safe='')))) + + + art = index + '/telugu-article/' + art_list = [ + ('చదువు', 'education'), + ('సుఖీభవ', 'health'), + ('ఆహా', 'recipes'), + ('హాయ్ బుజ్జీ', 'kids-stories'), + ('మకరందం', 
'devotional'), + ('దేవతార్చన', 'temples'), + ('స్థిరాస్తి', 'real-estate'), + ('కథామృతం', 'kathalu'), + ('సండే మ్యాగజైన్', 'sunday-magazine') + ] + for x in art_list: + art_index = art + x[1] + '/' + feeds.append((x[0], a.format(when, quote(art_index, safe='')))) + feeds.append(('Other Articles', a.format(when, quote(art, safe='')))) + + feeds.append(('ఇతరులు', a.format(when, quote(index, safe='')))) + feeds.append(('ప్రతిభ', a.format(when, 'https://pratibha.eenadu.net/'))) + + def populate_article_metadata(self, article, soup, first): + article.url = self.art_url + article.title = article.title.replace(' - Eenadu', '') + desc = soup.find(attrs={'class':'srtdes'}) + if desc: + article.summary = self.tag_to_string(desc) + article.text_summary = article.summary + + def preprocess_raw_html(self, raw, *a): + import re + if '<body>' in raw: + body = re.search(r'<body>([^~]+?)</body>', raw) + return '<html><body>' + body.group(1) + '</body></html>' + return raw diff --git a/recipes/eenadu_ap.recipe b/recipes/eenadu_ap.recipe index 4c7d8d0a2e42..9d775950420f 100644 --- a/recipes/eenadu_ap.recipe +++ b/recipes/eenadu_ap.recipe @@ -1,7 +1,5 @@ -import re -from datetime import date, datetime, timedelta +from urllib.parse import quote -from calibre.utils.date import parse_date from calibre.web.feeds.news import BasicNewsRecipe, classes @@ -10,137 +8,116 @@ class eenadu_ap(BasicNewsRecipe): __author__ = 'unkn0wn' description = 'THE LARGEST CIRCULATED TELUGU DAILY' language = 'te' - use_embedded_content = False - remove_javascript = True + encoding = 'utf-8' no_stylesheets = True - remove_attributes = ['height', 'width', 'style'] - ignore_duplicate_articles = {'url', 'title'} + remove_javascript = True masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png' - cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/03/CAN/5_01/bfff5654_01_mr.jpg' - encoding = 'utf-8' + remove_attributes = ['style', 'height', 'width'] + ignore_duplicate_articles = {'url', 'title'} + reverse_article_order = True remove_empty_feeds = True - extra_css = '.pub-t{font-size:small; font-style:italic;}' + simultaneous_downloads = 1 + art_url = '' - keep_only_tags = [ - dict(name='h1'), - dict(**classes('pub-t')), - classes('fullstory text-justify contlist-cont'), - dict(name='span', attrs={'id': 'PDSAIApbreak'}), - ] + extra_css = ''' + img {display:block; margin:0 auto;} + blockquote, em {color:#202020;} + .pub-t{font-size:small; font-style:italic;} + ''' - remove_tags = [ - dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}), - dict( - name='p', - attrs={ - 'style': - 'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;' - } - ), - dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}), - dict(name='br'), - classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list') - ] + keep_only_tags = [classes('bookWrapper fullstory')] + remove_tags = [classes('ext-link offset-tb1 sshare-c')] + + articles_are_obfuscated = True + + def get_obfuscated_article(self, url): + br = self.get_browser() + soup = self.index_to_soup(url) + link = soup.a['href'] + skip_sections =[ # add sections you want to skip + '/video/', '/videos/', '/multimedia/', 'marathi', 'hindi', 'bangla' + ] + if any(x in link for x in skip_sections): + self.abort_article('skipping video links') + self.log('Found ', link) + self.art_url = link + html = br.open(link).read() + return ({ 'data': html, 'url': link }) + + resolve_internal_links = True + remove_empty_feeds = True def get_cover_url(self): + import json from datetime import date - cover = 'https://img.kiosko.net/' + str( - date.today().year - ) + '/' + date.today().strftime('%m') + '/' + date.today( - ).strftime('%d') + '/in/eenadu.750.jpg' - br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False) - try: - br.open(cover) - except: - index = 'https://es.kiosko.net/in/np/eenadu.html' - soup = self.index_to_soup(index) - for image in soup.findAll('img', src=True): - if image['src'].endswith('750.jpg'): - return 'https:' + image['src'] - self.log("\nCover unavailable") - cover = None - return cover - - def parse_index(self): - section_list = [ - ('ఆంధ్రప్రదేశ్ తాజా వార్తలు', 'andhra-pradesh'), - ('సంపాదకీయం', 'andhra-pradesh/editorial'), - ('ఆంధ్రప్రదేశ్ ప్రధానాంశాలు', 'andhra-pradesh/top-news'), - ('ఆంధ్రప్రదేశ్ జిల్లా వార్తలు', 'andhra-pradesh/districts'), - # ('క్రైమ్', 'crime'), + today = quote(date.today().strftime('%d/%m/%Y'),
safe='') + raw = self.index_to_soup( + 'https://epaper.eenadu.net/Home/GetAllpages?editionid=2&editiondate=' + today, raw=True + ) + for cov in json.loads(raw): + if cov['NewsProPageTitle'].lower().startswith('front'): + return cov['HighResolution'] + + feeds = [] + + when = '27' # hours + index = 'https://www.eenadu.net' + a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=te-IN&gl=IN&ceid=IN:te' + + news = index + '/telugu-news/' + news_list = [ + ('ఆంధ్రప్రదేశ్ ప్రధానాంశాలు', 'ap-top-news'), + ('సంపాదకీయం', 'editorial'), + ('వ్యాఖ్యానం', 'vyakyanam'), + ('విశాఖపట్నం జిల్లా వార్తలు', 'districts/Visakhapatnam'), + ('క్రైమ్', 'crime'), ('పాలిటిక్స్', 'politics'), ('జాతీయం', 'india'), ('బిజినెస్', 'business'), ('అంతర్జాతీయం', 'world'), ('క్రీడలు', 'sports'), - # ('సినిమా', 'movies'), - # ('చదువు', 'education'), - # ('సుఖీభవ', 'health'), - # ('ఈ-నాడు', 'technology'), - # ('మకరందం', 'devotional'), - # ('ఈ తరం', 'youth'), - # ('ఆహా', 'recipes'), - # ('హాయ్ బుజ్జీ', 'kids-stories'), - # ('స్థిరాస్తి', 'real-estate'), + ('సినిమా', 'movies'), + ('వసుంధర', 'women'), + ('ఈ-నాడు', 'technology'), + ('వెబ్ ప్రత్యేకం', 'explained') ] - is_sunday = date.today().weekday() == 6 - if is_sunday: - section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine')) - feeds = [] - - # For each section title, fetch the article urls - for section in section_list: - section_title = section[0] - section_url = 'https://www.eenadu.net/' + section[1] - self.log(section_title, section_url) - soup = self.index_to_soup(section_url) - articles = self.articles_from_soup(soup) - if articles: - feeds.append((section_title, articles)) - return feeds - - def articles_from_soup(self, soup): - ans = [] - for link in soup.findAll( - attrs={ - 'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel'] - } - ): - for a in link.findAll('a', attrs={'href': True}): - url = a['href'] - if url.startswith('https') is False: - url = 'https://www.eenadu.net/' + url - - try: - desc = self.tag_to_string(a.find('div')).strip() - except Exception: - desc = '' - - for h3 in a.findAll('h3'): - title = self.tag_to_string(h3).strip() - sub = re.escape(title) - desc = re.sub(sub, '', desc).strip() - - if not title or not url: - continue - - self.log('\t', title, '\n\t', desc, '\n\t\t', url) - ans.append({'title': title, 'url': url, 'description': desc}) - return ans - - def preprocess_html(self, soup): - div = soup.find('div', **classes('pub-t')) - if div: - date = parse_date( - self.tag_to_string(div).strip().replace('Published : ', '').replace( - 'Updated : ', '' - ).replace(' IST', ':00.000001') - ).replace(tzinfo=None) - today = datetime.now() - if (today - date) > timedelta(1.15): - self.abort_article('Skipping old article') - else: - self.abort_article('not an article') - for img in soup.findAll('img', attrs={'data-src': True}): - img['src'] = img['data-src'] - return soup + for n in news_list: + news_index = news + n[1] + '/' + feeds.append((n[0], a.format(when, quote(news_index, safe='')))) + feeds.append(('Other News', a.format(when, quote(news, safe='')))) + + + art = index + '/telugu-article/' + art_list = [ + ('చదువు', 'education'), + ('సుఖీభవ', 'health'), + ('ఆహా', 'recipes'), + ('హాయ్ బుజ్జీ', 'kids-stories'), + ('మకరందం', 'devotional'), + ('దేవతార్చన', 'temples'), + ('స్థిరాస్తి', 'real-estate'), + ('కథామృతం', 'kathalu'), + ('సండే మ్యాగజైన్', 'sunday-magazine') + ] + for x in art_list: + art_index = art + x[1] + '/' + feeds.append((x[0], a.format(when, quote(art_index, safe='')))) + feeds.append(('Other Articles', 
a.format(when, quote(art, safe='')))) + + feeds.append(('ఇతరులు', a.format(when, quote(index, safe='')))) + feeds.append(('ప్రతిభ', a.format(when, 'https://pratibha.eenadu.net/'))) + + def populate_article_metadata(self, article, soup, first): + article.url = self.art_url + article.title = article.title.replace(' - Eenadu', '') + desc = soup.find(attrs={'class':'srtdes'}) + if desc: + article.summary = self.tag_to_string(desc) + article.text_summary = article.summary + + def preprocess_raw_html(self, raw, *a): + import re + if '<body>' in raw: + body = re.search(r'<body>([^~]+?)</body>', raw) + return '<html><body>' + body.group(1) + '</body></html>' + return raw diff --git a/recipes/el_correo.recipe b/recipes/el_correo.recipe index 7fa3f54b9f8e..4f03835d4684 100644 --- a/recipes/el_correo.recipe +++ b/recipes/el_correo.recipe @@ -1,184 +1,114 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__copyright__ = '08 Januery 2011, desUBIKado' -__author__ = 'desUBIKado' -__description__ = 'Daily newspaper from Biscay' -__version__ = 'v0.14' -__date__ = '10, September 2017' ''' http://www.elcorreo.com/ ''' -import re -import time - -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe, classes class elcorreo(BasicNewsRecipe): - __author__ = 'desUBIKado' - description = 'Daily newspaper from Biscay' - title = u'El Correo' - publisher = 'Vocento' - category = 'News, politics, culture, economy, general interest' - oldest_article = 1 - delay = 1 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - masthead_url = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png' + title = 'El Correo' + __author__ = 'unkn0wn' + description = 'Daily newspaper in Bilbao and the Basque Country of northern Spain' + oldest_article = 1 # days language = 'es' - timefmt = '[%a, %d %b, %Y]' + no_stylesheets = True + remove_attributes = ['height', 'width', 'style'] + ignore_duplicate_articles = {'url'} + encoding = 'utf-8' + masthead_url = 'https://s1.ppllstatics.com/starfish/1.3.76/assets/images/logos/logo-elcorreo.svg' encoding = 'utf-8' remove_empty_feeds = True - remove_javascript = True + resolve_internal_links = True - feeds = [ - (u'Portada', u'http://www.elcorreo.com/rss/atom/portada'), - (u'Mundo', u'http://www.elcorreo.com/rss/atom/?section=internacional'), - (u'Bizkaia', u'http://www.elcorreo.com/rss/atom/?section=bizkaia'), - (u'Guipuzkoa', u'http://www.elcorreo.com/rss/atom/?section=gipuzkoa'), - (u'Araba', u'http://www.elcorreo.com/rss/atom/?section=araba'), - (u'La Rioja', u'http://www.elcorreo.com/rss/atom/?section=larioja'), - (u'Miranda', u'http://www.elcorreo.com/rss/atom/?section=miranda'), - (u'Economía', u'http://www.elcorreo.com/rss/atom/?section=economia'), - (u'Culturas', u'http://www.elcorreo.com/rss/atom/?section=culturas'), - (u'Politica', u'http://www.elcorreo.com/rss/atom/?section=politica'), - (u'Tecnología', u'http://www.elcorreo.com/rss/atom/?section=tecnologia'), - (u'Gente - Estilo', u'http://www.elcorreo.com/rss/atom/?section=gente-estilo'), - (u'Planes', u'http://www.elcorreo.com/rss/atom/?section=planes'), - (u'Athletic', u'http://www.elcorreo.com/rss/atom/?section=athletic'), - (u'Alavés', u'http://www.elcorreo.com/rss/atom/?section=alaves'), - (u'Bilbao Basket', u'http://www.elcorreo.com/rss/atom/?section=bilbaobasket'), - (u'Baskonia', u'http://www.elcorreo.com/rss/atom/?section=baskonia'), - (u'Deportes', u'http://www.elcorreo.com/rss/atom/?section=deportes'), - (u'Jaiak', u'http://www.elcorreo.com/rss/atom/?section=jaiak'), - (u'La Blanca', u'http://www.elcorreo.com/rss/atom/?section=la-blanca-vitoria'), - (u'Aste Nagusia', u'http://www.elcorreo.com/rss/atom/?section=aste-nagusia-bilbao'), - (u'Semana Santa', u'http://www.elcorreo.com/rss/atom/?section=semana-santa'), - (u'Festivales', u'http://www.elcorreo.com/rss/atom/?section=festivales') - ] + extra_css = ''' + .v-mdl-ath__inf, .v-mdl-ath__p--2, .v-mdl-ath__p {font-size:small; color:#404040;} + .v-fc, .v-a-fig { text-align:center; font-size:small; } + #sub { font-style:italic; color:#202020; } + blockquote, em { color:#202020; } + img { display:block; margin:0
auto; } + ''' + + def get_cover_url(self): + from datetime import date + return 'https://portada.iperiodico.es/' + date.today().strftime('%Y/%m/%d') + '_elcorreo.750.jpg' keep_only_tags = [ - dict(name='div', attrs={'class': ['col-xs-12 col-sm-12 col-md-8 col-lg-8']}) + dict(name='article', attrs={'class': lambda x: x and set(x.split()).intersection( + {'v-a--d-bs', 'v-a--d-opn', 'v-a--d-rpg'})}), + classes( + 'v-d--ab-c v-d--rpg' + ) ] remove_tags = [ - dict( - name='div', - attrs={ - 'class': [ - 'voc-topics voc-detail-grid ', 'voc-newsletter ', - 'voc-author-social' - ] - } - ), - dict(name='section', attrs={'class': ['voc-ficha-detail voc-file-sports']}) + dict(name = ['svg', 'section', 'nav']), + dict(attrs={'data-voc-component':['dropdown', 'modal', 'slider-grab']}), + classes( + 'v-mdl-ath__img-c v-adv v-i-b v-mdl-ath__c--2 v-d-cmp-adv v-d-cmp-nws ' + 'v-pill-m--zoom v-stk-adv slider-grab g-artboard v-d-cmp-rld v-pill-m--glly' + ) ] - remove_tags_before = dict( - name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-8 col-lg-8'} - ) - remove_tags_after = dict( - name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-8 col-lg-8'} - ) - - _processed_links = [] - - def get_article_url(self, article): - link = article.get('link', None) - - if link is None: - return article - - # modificamos la url de las noticias de los equipos deportivos para que funcionen, por ejemplo: - # http://athletic.elcorreo.com/noticias/201407/27/muniain-estrella-athletic-para-20140727093046.html - # http://m.elcorreo.com/noticias/201407/27/muniain-estrella-athletic-para-20140727093046.html?external=deportes/athletic - - parte = link.split('/') - - if parte[2] == 'athletic.elcorreo.com': - link = 'http://www.elcorreo.com/' + parte[3] + '/' + parte[ - 4 - ] + '/' + parte[5] + '/' + parte[6] + '?external=deportes/athletic' - else: - if parte[2] == 'baskonia.elcorreo.com': - link = 'http://www.elcorreo.com/' + parte[3] + '/' + parte[ - 4 - ] + '/' + parte[5] + '/' + parte[6] + '?external=deportes/baskonia' + def preprocess_html(self, soup): + art = soup.find('article') + h1 = soup.find('h1') + h2 = soup.find('h2') + if h1 and art: + art.insert_before(h1) + if h2 and h1: + h1.insert_after(h2) + h2.name = 'p' + h2['id'] = 'sub' + for but in soup.findAll('button'): + if but.find('img'): + but.name = 'div' else: - if parte[2] == 'bilbaobasket.elcorreo.com': - link = 'http://www.elcorreo.com/' + parte[3] + '/' + parte[ - 4 - ] + '/' + parte[5] + '/' + parte[ - 6 - ] + '?external=deportes/bilbaobasket' - else: - if parte[2] == 'alaves.elcorreo.com': - link = 'http://www.elcorreo.com/' + parte[3] + '/' + parte[ - 4 - ] + '/' + parte[5] + '/' + parte[ - 6 - ] + '?external=deportes/alaves' - - # A veces el mismo articulo aparece en la versión de Alava y en la de Bizkaia. 
Por ejemplo: - # http://www.elcorreo.com/alava/deportes/motor/formula-1/201407/27/ecclestone-quiere-briatore-ayude-20140727140820-rc.html - # http://www.elcorreo.com/bizkaia/deportes/motor/formula-1/201407/27/ecclestone-quiere-briatore-ayude-20140727140820-rc.html - # para controlar los duplicados, unificamos las url para que sean siempre de bizkaia (excepto para la sección "araba") - - if ((parte[3] == 'alava') and (parte[4] != 'araba')): - link = link.replace('elcorreo.com/alava', 'elcorreo.com/bizkaia') - - # Controlamos si el artículo ha sido incluido en otro feed para eliminarlo - - if link not in self._processed_links: - self._processed_links.append(link) - else: - link = None - - return link - - # Recuperamos la portada de papel (la imagen format=1 tiene mayor resolucion) - - def get_cover_url(self): - cover = None - st = time.localtime() - year = str(st.tm_year) - month = "%.2d" % st.tm_mon - day = "%.2d" % st.tm_mday - # http://info.elcorreo.com/pdf/07082013-viz.pdf - cover = 'http://info.elcorreo.com/pdf/' + day + month + year + '-viz.pdf' - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - self.log("\nPortada no disponible") - cover = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png' - return cover - - # Para cambiar el estilo del texto - - extra_css = ''' - h1 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:28px;} - h2 {font-family:georgia,serif; font-style:italic; font-weight:normal;font-size:16px;color:#4D4D4D;} - h3 {font-family:georgia,serif; font-weight:bold;font-size:18px;} - ''' - - preprocess_regexps = [ - - # Para presentar la imagen de los video incrustados - ( - re.compile(r'stillURLVideo: \'', re.DOTALL | re.IGNORECASE), - lambda match: '