diff --git a/Changelog.txt b/Changelog.txt
index ba0dbbf31389..351c8a5e6a0a 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -23,6 +23,36 @@
# - title by author
# }}}
+{{{ 7.9.0 2024-04-19
+
+:: new features
+
+- [2060886] Kobo driver: Add support for the new color Kobo devices
+
+- Edit book: Add a setting to control cursor width under Preferences->Editor settings
+
+- Edit book: Regex-function mode: Show a confirmation popup when closing the function editor with unsaved changes
+
+:: bug fixes
+
+- [2060314] Fix undocked Quickview dialog not being restored at startup
+
+- [2044118] Windows: Fix an issue where closing a maximized calibre window to the system tray and then reconnecting with remote desktop would cause a blank calibre window to be displayed
+
+:: improved recipes
+- El Correo
+- Eenadu
+- ORFonline
+- NatGeo
+- Harper's Magazine
+- New Yorker
+- Business Today
+- The Week
+- Asahi Shimbun
+- Outlook Magazine
+
+}}}
+
{{{ 7.8.0 2024-04-05
:: new features
diff --git a/bypy/windows/XUnzip.cpp b/bypy/windows/XUnzip.cpp
index 40de3b2cec22..29a7f235d0c8 100644
--- a/bypy/windows/XUnzip.cpp
+++ b/bypy/windows/XUnzip.cpp
@@ -3072,7 +3072,7 @@ unzFile unzOpenInternal(LUFILE *fin)
uLong number_disk; // number of the current disk, used for spanning ZIP, unsupported, always 0
if (unzlocal_getShort(fin,&number_disk)!=UNZ_OK) err=UNZ_ERRNO;
// number of the disk with the start of the central directory
- uLong number_disk_with_CD; // number the the disk with central dir, used for spanning ZIP, unsupported, always 0
+ uLong number_disk_with_CD; // number of the disk with central dir, used for spanning ZIP, unsupported, always 0
if (unzlocal_getShort(fin,&number_disk_with_CD)!=UNZ_OK) err=UNZ_ERRNO;
// total number of entries in the central dir on this disk
if (unzlocal_getShort(fin,&us.gi.number_entry)!=UNZ_OK) err=UNZ_ERRNO;
diff --git a/bypy/windows/wix.py b/bypy/windows/wix.py
index 5c93e21c4bfe..6eb2f88018d1 100644
--- a/bypy/windows/wix.py
+++ b/bypy/windows/wix.py
@@ -19,6 +19,11 @@
j, d, a, b = os.path.join, os.path.dirname, os.path.abspath, os.path.basename
+def add_wix_extension(name):
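+ # 'wix extension add -g' caches the extension under ~/.wix/extensions, so skip the download when it is already present.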
+ if not os.path.exists(os.path.expanduser(f'~/.wix/extensions/{name}')):
+ run(WIX, 'extension', 'add', '-g', name)
+
+
def create_installer(env, compression_level='9'):
cl = int(compression_level)
if cl > 4:
@@ -62,8 +67,8 @@ def create_installer(env, compression_level='9'):
arch = 'x64' if is64bit else 'x86'
installer = j(env.dist, '%s%s-%s.msi' % (
calibre_constants['appname'], ('-64bit' if is64bit else ''), calibre_constants['version']))
- run(WIX, 'extension', 'add', '-g', 'WixToolset.Util.wixext')
- run(WIX, 'extension', 'add', '-g', 'WixToolset.UI.wixext')
+ add_wix_extension('WixToolset.Util.wixext')
+ add_wix_extension('WixToolset.UI.wixext')
cmd = [WIX, 'build', '-arch', arch, '-culture', 'en-us', '-loc', enusf, '-dcl', dcl,
'-ext', 'WixToolset.Util.wixext', '-ext', 'WixToolset.UI.wixext', '-o', installer, wxsf]
run(*cmd)
diff --git a/format_docs/pdb/pdb_format.txt b/format_docs/pdb/pdb_format.txt
index 77fcba87db37..9d8851b7e873 100644
--- a/format_docs/pdb/pdb_format.txt
+++ b/format_docs/pdb/pdb_format.txt
@@ -3,7 +3,7 @@ Format
A PDB file can be broken into multiple parts: the header, record 0, and data.
Values stored within the various parts are in big-endian byte order. The data
-part is is broken down into multiple sections. The section count and offsets
+part is broken down into multiple sections. The section count and offsets
are referenced in the PDB header. Sections can be no more than 65505 bytes in
length.
diff --git a/recipes/asahi_shimbun_en.recipe b/recipes/asahi_shimbun_en.recipe
index d559901de7c2..7cdbfbbc8b47 100644
--- a/recipes/asahi_shimbun_en.recipe
+++ b/recipes/asahi_shimbun_en.recipe
@@ -153,7 +153,7 @@ class AsahiShimbunEnglishNews(BasicNewsRecipe):
("Asia & World - World", self.get_section("asia_world/world")),
("Sci & Tech", self.get_section("sci_tech")),
("Culture - Style", self.get_section("culture/style")),
- ("Culture - Cooking", self.get_section("culture/cooking")),
+ # ("Culture - Cooking", self.get_section("culture/cooking")),
("Culture - Movies", self.get_section("culture/movies")),
("Culture - Manga & Anime", self.get_section("culture/manga_anime")),
("Travel", self.get_section("travel")),
diff --git a/recipes/barrons.recipe b/recipes/barrons.recipe
index 0a5d08e7f564..fd833aa2f5bb 100644
--- a/recipes/barrons.recipe
+++ b/recipes/barrons.recipe
@@ -77,10 +77,14 @@ class barrons(BasicNewsRecipe):
return br
def parse_index(self):
+ self.log(
+ '\n***\nif this recipe fails, report it on: '
+ 'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
+ )
archive = self.index_to_soup('https://www.barrons.com/magazine?archives=' + date.today().strftime('%Y'))
issue = archive.find(**prefixed_classes('BarronsTheme--archive-box--'))
self.timefmt = ' [' + self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--date--'))) + ']'
- self.log(self.timefmt)
+ self.description = self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--headline--')))
self.cover_url = issue.img['src'].split('?')[0]
ans = defaultdict(list)
diff --git a/recipes/business_today.recipe b/recipes/business_today.recipe
index 44c5705ba379..1ea2f2b3495c 100644
--- a/recipes/business_today.recipe
+++ b/recipes/business_today.recipe
@@ -74,7 +74,7 @@ class BT(BasicNewsRecipe):
# Insert feeds in specified order, if available
- feedSort = ['Editor\'s Note']
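+ # The magazine spells this section both ways; match either variant.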
+ feedSort = ['Editor\'s Note', 'Editors note']
for i in feedSort:
if i in sections:
feeds.append((i, sections[i]))
@@ -82,7 +82,8 @@ class BT(BasicNewsRecipe):
# Done with the sorted feeds
for i in feedSort:
- del sections[i]
+ if i in sections:
+ del sections[i]
# Append what is left over...
diff --git a/recipes/caravan_magazine.recipe b/recipes/caravan_magazine.recipe
index 7896deda7261..c271c3a92c14 100644
--- a/recipes/caravan_magazine.recipe
+++ b/recipes/caravan_magazine.recipe
@@ -92,8 +92,8 @@ class CaravanMagazine(BasicNewsRecipe):
'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
)
api = 'https://api.caravanmagazine.in/api/trpc/magazines.getLatestIssue'
- # https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&input=
- # %7B%220%22%3A%7B%22json%22%3A%7B%22month%22%3A 2 %2C%22year%22%3A 2024 %7D%7D%7D
+ # api = 'https://api.caravanmagazine.in/api/trpc/magazines.getForMonthAndYear?batch=1&' + \
+ # 'input=%7B%220%22%3A%7B%22json%22%3A%7B%22month%22%3A' + '2' + '%2C%22year%22%3A' + '2024' + '%7D%7D%7D'
# input={"0":{"json":{"month":2,"year":2024}}}
raw = self.index_to_soup(api, raw=True)
data = json.loads(raw)['result']['data']['json']
diff --git a/recipes/eenadu.recipe b/recipes/eenadu.recipe
index cebec9e5a2c8..893dec36e064 100644
--- a/recipes/eenadu.recipe
+++ b/recipes/eenadu.recipe
@@ -1,7 +1,5 @@
-import re
-from datetime import date, datetime, timedelta
+from urllib.parse import quote
-from calibre.utils.date import parse_date
from calibre.web.feeds.news import BasicNewsRecipe, classes
@@ -10,118 +8,116 @@ class eenadu_ts(BasicNewsRecipe):
__author__ = 'unkn0wn'
description = 'THE LARGEST CIRCULATED TELUGU DAILY'
language = 'te'
- use_embedded_content = False
- remove_javascript = True
+ encoding = 'utf-8'
no_stylesheets = True
- remove_attributes = ['height', 'width', 'style']
- ignore_duplicate_articles = {'url', 'title'}
+ remove_javascript = True
masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
- cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/08/GTH/5_01/d5041804_01_mr.jpg'
- encoding = 'utf-8'
+ remove_attributes = ['style', 'height', 'width']
+ ignore_duplicate_articles = {'url', 'title'}
+ reverse_article_order = True
remove_empty_feeds = True
- extra_css = '.pub-t{font-size:small; font-style:italic;}'
+ simultaneous_downloads = 1
+ art_url = ''
- keep_only_tags = [
- dict(name='h1'),
- dict(**classes('pub-t')),
- classes('fullstory text-justify contlist-cont'),
- dict(name='span', attrs={'id': 'PDSAIApbreak'}),
- ]
+ extra_css = '''
+ img {display:block; margin:0 auto;}
+ blockquote, em {color:#202020;}
+ .pub-t{font-size:small; font-style:italic;}
+ '''
- remove_tags = [
- dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}),
- dict(
- name='p',
- attrs={
- 'style':
- 'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;'
- }
- ),
- dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}),
- dict(name='br'),
- classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list')
- ]
+ keep_only_tags = [classes('bookWrapper fullstory')]
+ remove_tags = [classes('ext-link offset-tb1 sshare-c')]
+
+ articles_are_obfuscated = True
+
+ def get_obfuscated_article(self, url):
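+ # Each Google News item is a redirect page; its first link points at the real Eenadu article.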
+ br = self.get_browser()
+ soup = self.index_to_soup(url)
+ link = soup.a['href']
+ skip_sections = [ # add sections you want to skip
+ '/video/', '/videos/', '/multimedia/', 'marathi', 'hindi', 'bangla'
+ ]
+ if any(x in link for x in skip_sections):
+ self.abort_article('skipping video links')
+ self.log('Found ', link)
+ self.art_url = link
+ html = br.open(link).read()
+ return {'data': html, 'url': link}
- def parse_index(self):
- section_list = [
- ('తెలంగాణ తాజా వార్తలు', 'telangana'),
- ('సంపాదకీయం', 'telangana/editorial'),
- ('తెలంగాణ ప్రధానాంశాలు', 'telangana/top-news'),
- ('తెలంగాణ జిల్లా వార్తలు', 'telangana/districts'),
- # ('క్రైమ్', 'crime'),
+ resolve_internal_links = True
+
+ def get_cover_url(self):
+ import json
+ from datetime import date
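+ # The epaper endpoint lists every page of today's edition as JSON (editionid=1 is the Telangana edition); use the front page as the cover.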
+ today = quote(date.today().strftime('%d/%m/%Y'), safe='')
+ raw = self.index_to_soup(
+ 'https://epaper.eenadu.net/Home/GetAllpages?editionid=1&editiondate=' + today, raw=True
+ )
+ for cov in json.loads(raw):
+ if cov['NewsProPageTitle'].lower().startswith('front'):
+ return cov['HighResolution']
+
+ feeds = []
+
+ when = '27' # hours
+ index = 'https://www.eenadu.net'
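+ # Google News search RSS: items from the last {when} hours whose URLs contain the given section path.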
+ a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=te-IN&gl=IN&ceid=IN:te'
+
+ news = index + '/telugu-news/'
+ news_list = [
+ ('తెలంగాణ ప్రధానాంశాలు', 'ts-top-news'),
+ ('సంపాదకీయం', 'editorial'),
+ ('వ్యాఖ్యానం', 'vyakyanam'),
+ ('హైదరాబాద్ జిల్లా వార్తలు', 'districts/Hyderabad'),
+ ('క్రైమ్', 'crime'),
('పాలిటిక్స్', 'politics'),
('జాతీయం', 'india'),
('బిజినెస్', 'business'),
('అంతర్జాతీయం', 'world'),
('క్రీడలు', 'sports'),
- # ('సినిమా', 'movies'),
- # ('చదువు', 'education'),
- # ('సుఖీభవ', 'health'),
- # ('ఈ-నాడు', 'technology'),
- # ('మకరందం', 'devotional'),
- # ('ఈ తరం', 'youth'),
- # ('ఆహా', 'recipes'),
- # ('హాయ్ బుజ్జీ', 'kids-stories'),
- # ('స్థిరాస్తి', 'real-estate'),
+ ('సినిమా', 'movies'),
+ ('వసుంధర', 'women'),
+ ('ఈ-నాడు', 'technology'),
+ ('వెబ్ ప్రత్యేకం', 'explained')
]
- is_sunday = date.today().weekday() == 6
- if is_sunday:
- section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine'))
- feeds = []
-
- # For each section title, fetch the article urls
- for section in section_list:
- section_title = section[0]
- section_url = 'https://www.eenadu.net/' + section[1]
- self.log(section_title, section_url)
- soup = self.index_to_soup(section_url)
- articles = self.articles_from_soup(soup)
- if articles:
- feeds.append((section_title, articles))
- return feeds
-
- def articles_from_soup(self, soup):
- ans = []
- for link in soup.findAll(
- attrs={
- 'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel']
- }
- ):
- for a in link.findAll('a', attrs={'href': True}):
- url = a['href']
- if url.startswith('https') is False:
- url = 'https://www.eenadu.net/' + url
-
- try:
- desc = self.tag_to_string(a.find('div')).strip()
- except Exception:
- desc = ''
-
- for h3 in a.findAll('h3'):
- title = self.tag_to_string(h3).strip()
- sub = re.escape(title)
- desc = re.sub(sub, '', desc).strip()
-
- if not title or not url:
- continue
-
- self.log('\t', title, '\n\t', desc, '\n\t\t', url)
- ans.append({'title': title, 'url': url, 'description': desc})
- return ans
-
- def preprocess_html(self, soup):
- div = soup.find('div', **classes('pub-t'))
- if div:
- date = parse_date(
- self.tag_to_string(div).strip().replace('Published : ', '').replace(
- 'Updated : ', ''
- ).replace(' IST', ':00.000001')
- ).replace(tzinfo=None)
- today = datetime.now()
- if (today - date) > timedelta(1.15):
- self.abort_article('Skipping old article')
- else:
- self.abort_article('not an article')
- for img in soup.findAll('img', attrs={'data-src': True}):
- img['src'] = img['data-src']
- return soup
+ for n in news_list:
+ news_index = news + n[1] + '/'
+ feeds.append((n[0], a.format(when, quote(news_index, safe=''))))
+ feeds.append(('Other News', a.format(when, quote(news, safe=''))))
+
+ art = index + '/telugu-article/'
+ art_list = [
+ ('చదువు', 'education'),
+ ('సుఖీభవ', 'health'),
+ ('ఆహా', 'recipes'),
+ ('హాయ్ బుజ్జీ', 'kids-stories'),
+ ('మకరందం', 'devotional'),
+ ('దేవతార్చన', 'temples'),
+ ('స్థిరాస్తి', 'real-estate'),
+ ('కథామృతం', 'kathalu'),
+ ('సండే మ్యాగజైన్', 'sunday-magazine')
+ ]
+ for x in art_list:
+ art_index = art + x[1] + '/'
+ feeds.append((x[0], a.format(when, quote(art_index, safe=''))))
+ feeds.append(('Other Articles', a.format(when, quote(art, safe=''))))
+
+ feeds.append(('ఇతరులు', a.format(when, quote(index, safe=''))))
+ feeds.append(('ప్రతిభ', a.format(when, 'https://pratibha.eenadu.net/')))
+
+ def populate_article_metadata(self, article, soup, first):
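+ # Swap the Google News redirect for the real article URL saved in get_obfuscated_article.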
+ article.url = self.art_url
+ article.title = article.title.replace(' - Eenadu', '')
+ desc = soup.find(attrs={'class':'srtdes'})
+ if desc:
+ article.summary = self.tag_to_string(desc)
+ article.text_summary = article.summary
+
+ def preprocess_raw_html(self, raw, *a):
+ import re
+ if '<noscript>' in raw:
+ body = re.search(r'<noscript>([^~]+?)</noscript>', raw)
+ return '<html><body>' + body.group(1) + '</body></html>'
+ return raw
diff --git a/recipes/eenadu_ap.recipe b/recipes/eenadu_ap.recipe
index 4c7d8d0a2e42..9d775950420f 100644
--- a/recipes/eenadu_ap.recipe
+++ b/recipes/eenadu_ap.recipe
@@ -1,7 +1,5 @@
-import re
-from datetime import date, datetime, timedelta
+from urllib.parse import quote
-from calibre.utils.date import parse_date
from calibre.web.feeds.news import BasicNewsRecipe, classes
@@ -10,137 +8,116 @@ class eenadu_ap(BasicNewsRecipe):
__author__ = 'unkn0wn'
description = 'THE LARGEST CIRCULATED TELUGU DAILY'
language = 'te'
- use_embedded_content = False
- remove_javascript = True
+ encoding = 'utf-8'
no_stylesheets = True
- remove_attributes = ['height', 'width', 'style']
- ignore_duplicate_articles = {'url', 'title'}
+ remove_javascript = True
masthead_url = 'https://dxxd96tbpm203.cloudfront.net//img/logo.png'
- cover_url = 'https://d66zsp32hue2v.cloudfront.net/Eenadu/2022/08/03/CAN/5_01/bfff5654_01_mr.jpg'
- encoding = 'utf-8'
+ remove_attributes = ['style', 'height', 'width']
+ ignore_duplicate_articles = {'url', 'title'}
+ reverse_article_order = True
remove_empty_feeds = True
- extra_css = '.pub-t{font-size:small; font-style:italic;}'
+ simultaneous_downloads = 1
+ art_url = ''
- keep_only_tags = [
- dict(name='h1'),
- dict(**classes('pub-t')),
- classes('fullstory text-justify contlist-cont'),
- dict(name='span', attrs={'id': 'PDSAIApbreak'}),
- ]
+ extra_css = '''
+ img {display:block; margin:0 auto;}
+ blockquote, em {color:#202020;}
+ .pub-t{font-size:small; font-style:italic;}
+ '''
- remove_tags = [
- dict(name='span', attrs={'style': 'float:left; margin-right:10px;'}),
- dict(
- name='p',
- attrs={
- 'style':
- 'font-size: 18px !important; margin: 0px; margin-top: -15px; text-align: center;flex: 1;'
- }
- ),
- dict(name='aside', attrs={'class': lambda x: x and x.startswith('thumb')}),
- dict(name='br'),
- classes('sshare-c tags andbeyond_ad fnt20 arti more2 offset-tb1 msb-list')
- ]
+ keep_only_tags = [classes('bookWrapper fullstory')]
+ remove_tags = [classes('ext-link offset-tb1 sshare-c')]
+
+ articles_are_obfuscated = True
+
+ def get_obfuscated_article(self, url):
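+ # Google News items are redirect pages; follow the first link to the actual Eenadu article.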
+ br = self.get_browser()
+ soup = self.index_to_soup(url)
+ link = soup.a['href']
+ skip_sections = [ # add sections you want to skip
+ '/video/', '/videos/', '/multimedia/', 'marathi', 'hindi', 'bangla'
+ ]
+ if any(x in link for x in skip_sections):
+ self.abort_article('skipping video links')
+ self.log('Found ', link)
+ self.art_url = link
+ html = br.open(link).read()
+ return {'data': html, 'url': link}
+
+ resolve_internal_links = True
def get_cover_url(self):
+ import json
from datetime import date
- cover = 'https://img.kiosko.net/' + str(
- date.today().year
- ) + '/' + date.today().strftime('%m') + '/' + date.today(
- ).strftime('%d') + '/in/eenadu.750.jpg'
- br = BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)
- try:
- br.open(cover)
- except:
- index = 'https://es.kiosko.net/in/np/eenadu.html'
- soup = self.index_to_soup(index)
- for image in soup.findAll('img', src=True):
- if image['src'].endswith('750.jpg'):
- return 'https:' + image['src']
- self.log("\nCover unavailable")
- cover = None
- return cover
-
- def parse_index(self):
- section_list = [
- ('ఆంధ్రప్రదేశ్ తాజా వార్తలు', 'andhra-pradesh'),
- ('సంపాదకీయం', 'andhra-pradesh/editorial'),
- ('ఆంధ్రప్రదేశ్ ప్రధానాంశాలు', 'andhra-pradesh/top-news'),
- ('ఆంధ్రప్రదేశ్ జిల్లా వార్తలు', 'andhra-pradesh/districts'),
- # ('క్రైమ్', 'crime'),
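+ # Pull today's edition pages from the epaper JSON; editionid=2 selects the Andhra Pradesh edition.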
+ today = quote(date.today().strftime('%d/%m/%Y'), safe='')
+ raw = self.index_to_soup(
+ 'https://epaper.eenadu.net/Home/GetAllpages?editionid=2&editiondate=' + today, raw=True
+ )
+ for cov in json.loads(raw):
+ if cov['NewsProPageTitle'].lower().startswith('front'):
+ return cov['HighResolution']
+
+ feeds = []
+
+ when = '27' # hours
+ index = 'https://www.eenadu.net'
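+ # Build Google News search RSS feeds limited to the last {when} hours and to URLs under each section.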
+ a = 'https://news.google.com/rss/search?q=when:{}h+allinurl:{}&hl=te-IN&gl=IN&ceid=IN:te'
+
+ news = index + '/telugu-news/'
+ news_list = [
+ ('ఆంధ్రప్రదేశ్ ప్రధానాంశాలు', 'ap-top-news'),
+ ('సంపాదకీయం', 'editorial'),
+ ('వ్యాఖ్యానం', 'vyakyanam'),
+ ('విశాఖపట్నం జిల్లా వార్తలు', 'districts/Visakhapatnam'),
+ ('క్రైమ్', 'crime'),
('పాలిటిక్స్', 'politics'),
('జాతీయం', 'india'),
('బిజినెస్', 'business'),
('అంతర్జాతీయం', 'world'),
('క్రీడలు', 'sports'),
- # ('సినిమా', 'movies'),
- # ('చదువు', 'education'),
- # ('సుఖీభవ', 'health'),
- # ('ఈ-నాడు', 'technology'),
- # ('మకరందం', 'devotional'),
- # ('ఈ తరం', 'youth'),
- # ('ఆహా', 'recipes'),
- # ('హాయ్ బుజ్జీ', 'kids-stories'),
- # ('స్థిరాస్తి', 'real-estate'),
+ ('సినిమా', 'movies'),
+ ('వసుంధర', 'women'),
+ ('ఈ-నాడు', 'technology'),
+ ('వెబ్ ప్రత్యేకం', 'explained')
]
- is_sunday = date.today().weekday() == 6
- if is_sunday:
- section_list.append(('సండే మ్యాగజైన్', 'sunday-magazine'))
- feeds = []
-
- # For each section title, fetch the article urls
- for section in section_list:
- section_title = section[0]
- section_url = 'https://www.eenadu.net/' + section[1]
- self.log(section_title, section_url)
- soup = self.index_to_soup(section_url)
- articles = self.articles_from_soup(soup)
- if articles:
- feeds.append((section_title, articles))
- return feeds
-
- def articles_from_soup(self, soup):
- ans = []
- for link in soup.findAll(
- attrs={
- 'class': ['telugu_uni_body', 'thumb-content-more', 'owl-carousel']
- }
- ):
- for a in link.findAll('a', attrs={'href': True}):
- url = a['href']
- if url.startswith('https') is False:
- url = 'https://www.eenadu.net/' + url
-
- try:
- desc = self.tag_to_string(a.find('div')).strip()
- except Exception:
- desc = ''
-
- for h3 in a.findAll('h3'):
- title = self.tag_to_string(h3).strip()
- sub = re.escape(title)
- desc = re.sub(sub, '', desc).strip()
-
- if not title or not url:
- continue
-
- self.log('\t', title, '\n\t', desc, '\n\t\t', url)
- ans.append({'title': title, 'url': url, 'description': desc})
- return ans
-
- def preprocess_html(self, soup):
- div = soup.find('div', **classes('pub-t'))
- if div:
- date = parse_date(
- self.tag_to_string(div).strip().replace('Published : ', '').replace(
- 'Updated : ', ''
- ).replace(' IST', ':00.000001')
- ).replace(tzinfo=None)
- today = datetime.now()
- if (today - date) > timedelta(1.15):
- self.abort_article('Skipping old article')
- else:
- self.abort_article('not an article')
- for img in soup.findAll('img', attrs={'data-src': True}):
- img['src'] = img['data-src']
- return soup
+ for n in news_list:
+ news_index = news + n[1] + '/'
+ feeds.append((n[0], a.format(when, quote(news_index, safe=''))))
+ feeds.append(('Other News', a.format(when, quote(news, safe=''))))
+
+ art = index + '/telugu-article/'
+ art_list = [
+ ('చదువు', 'education'),
+ ('సుఖీభవ', 'health'),
+ ('ఆహా', 'recipes'),
+ ('హాయ్ బుజ్జీ', 'kids-stories'),
+ ('మకరందం', 'devotional'),
+ ('దేవతార్చన', 'temples'),
+ ('స్థిరాస్తి', 'real-estate'),
+ ('కథామృతం', 'kathalu'),
+ ('సండే మ్యాగజైన్', 'sunday-magazine')
+ ]
+ for x in art_list:
+ art_index = art + x[1] + '/'
+ feeds.append((x[0], a.format(when, quote(art_index, safe=''))))
+ feeds.append(('Other Articles', a.format(when, quote(art, safe=''))))
+
+ feeds.append(('ఇతరులు', a.format(when, quote(index, safe=''))))
+ feeds.append(('ప్రతిభ', a.format(when, 'https://pratibha.eenadu.net/')))
+
+ def populate_article_metadata(self, article, soup, first):
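+ # Restore the real article URL captured in get_obfuscated_article in place of the Google News link.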
+ article.url = self.art_url
+ article.title = article.title.replace(' - Eenadu', '')
+ desc = soup.find(attrs={'class':'srtdes'})
+ if desc:
+ article.summary = self.tag_to_string(desc)
+ article.text_summary = article.summary
+
+ def preprocess_raw_html(self, raw, *a):
+ import re
+ if '<noscript>' in raw:
+ body = re.search(r'<noscript>([^~]+?)</noscript>', raw)
+ return '<html><body>' + body.group(1) + '</body></html>'
+ return raw
diff --git a/recipes/el_correo.recipe b/recipes/el_correo.recipe
index 7fa3f54b9f8e..4f03835d4684 100644
--- a/recipes/el_correo.recipe
+++ b/recipes/el_correo.recipe
@@ -1,184 +1,114 @@
-#!/usr/bin/env python
-__license__ = 'GPL v3'
-__copyright__ = '08 Januery 2011, desUBIKado'
-__author__ = 'desUBIKado'
-__description__ = 'Daily newspaper from Biscay'
-__version__ = 'v0.14'
-__date__ = '10, September 2017'
'''
http://www.elcorreo.com/
'''
-import re
-import time
-
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe, classes
class elcorreo(BasicNewsRecipe):
- __author__ = 'desUBIKado'
- description = 'Daily newspaper from Biscay'
- title = u'El Correo'
- publisher = 'Vocento'
- category = 'News, politics, culture, economy, general interest'
- oldest_article = 1
- delay = 1
- max_articles_per_feed = 100
- no_stylesheets = True
- use_embedded_content = False
- masthead_url = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
+ title = 'El Correo'
+ __author__ = 'unkn0wn'
+ description = 'Daily newspaper in Bilbao and the Basque Country of northern Spain'
+ oldest_article = 1 # days
language = 'es'
- timefmt = '[%a, %d %b, %Y]'
+ no_stylesheets = True
+ remove_attributes = ['height', 'width', 'style']
+ ignore_duplicate_articles = {'url'}
+ masthead_url = 'https://s1.ppllstatics.com/starfish/1.3.76/assets/images/logos/logo-elcorreo.svg'
encoding = 'utf-8'
remove_empty_feeds = True
- remove_javascript = True
+ resolve_internal_links = True
- feeds = [
- (u'Portada', u'http://www.elcorreo.com/rss/atom/portada'),
- (u'Mundo', u'http://www.elcorreo.com/rss/atom/?section=internacional'),
- (u'Bizkaia', u'http://www.elcorreo.com/rss/atom/?section=bizkaia'),
- (u'Guipuzkoa', u'http://www.elcorreo.com/rss/atom/?section=gipuzkoa'),
- (u'Araba', u'http://www.elcorreo.com/rss/atom/?section=araba'),
- (u'La Rioja', u'http://www.elcorreo.com/rss/atom/?section=larioja'),
- (u'Miranda', u'http://www.elcorreo.com/rss/atom/?section=miranda'),
- (u'Economía', u'http://www.elcorreo.com/rss/atom/?section=economia'),
- (u'Culturas', u'http://www.elcorreo.com/rss/atom/?section=culturas'),
- (u'Politica', u'http://www.elcorreo.com/rss/atom/?section=politica'),
- (u'Tecnología', u'http://www.elcorreo.com/rss/atom/?section=tecnologia'),
- (u'Gente - Estilo', u'http://www.elcorreo.com/rss/atom/?section=gente-estilo'),
- (u'Planes', u'http://www.elcorreo.com/rss/atom/?section=planes'),
- (u'Athletic', u'http://www.elcorreo.com/rss/atom/?section=athletic'),
- (u'Alavés', u'http://www.elcorreo.com/rss/atom/?section=alaves'),
- (u'Bilbao Basket', u'http://www.elcorreo.com/rss/atom/?section=bilbaobasket'),
- (u'Baskonia', u'http://www.elcorreo.com/rss/atom/?section=baskonia'),
- (u'Deportes', u'http://www.elcorreo.com/rss/atom/?section=deportes'),
- (u'Jaiak', u'http://www.elcorreo.com/rss/atom/?section=jaiak'),
- (u'La Blanca', u'http://www.elcorreo.com/rss/atom/?section=la-blanca-vitoria'),
- (u'Aste Nagusia', u'http://www.elcorreo.com/rss/atom/?section=aste-nagusia-bilbao'),
- (u'Semana Santa', u'http://www.elcorreo.com/rss/atom/?section=semana-santa'),
- (u'Festivales', u'http://www.elcorreo.com/rss/atom/?section=festivales')
- ]
+ extra_css = '''
+ .v-mdl-ath__inf, .v-mdl-ath__p--2, .v-mdl-ath__p {font-size:small; color:#404040;}
+ .v-fc, .v-a-fig { text-align:center; font-size:small; }
+ #sub { font-style:italic; color:#202020; }
+ blockquote, em { color:#202020; }
+ img { display:block; margin:0 auto; }
+ '''
+
+ def get_cover_url(self):
+ from datetime import date
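+ # Front page scans are archived by date at portada.iperiodico.es.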
+ return 'https://portada.iperiodico.es/' + date.today().strftime('%Y/%m/%d') + '_elcorreo.750.jpg'
keep_only_tags = [
- dict(name='div', attrs={'class': ['col-xs-12 col-sm-12 col-md-8 col-lg-8']})
+ dict(name='article', attrs={'class': lambda x: x and set(x.split()).intersection(
+ {'v-a--d-bs', 'v-a--d-opn', 'v-a--d-rpg'})}),
+ classes(
+ 'v-d--ab-c v-d--rpg'
+ )
]
remove_tags = [
- dict(
- name='div',
- attrs={
- 'class': [
- 'voc-topics voc-detail-grid ', 'voc-newsletter ',
- 'voc-author-social'
- ]
- }
- ),
- dict(name='section', attrs={'class': ['voc-ficha-detail voc-file-sports']})
+ dict(name=['svg', 'section', 'nav']),
+ dict(attrs={'data-voc-component':['dropdown', 'modal', 'slider-grab']}),
+ classes(
+ 'v-mdl-ath__img-c v-adv v-i-b v-mdl-ath__c--2 v-d-cmp-adv v-d-cmp-nws '
+ 'v-pill-m--zoom v-stk-adv slider-grab g-artboard v-d-cmp-rld v-pill-m--glly'
+ )
]
- remove_tags_before = dict(
- name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-8 col-lg-8'}
- )
- remove_tags_after = dict(
- name='div', attrs={'class': 'col-xs-12 col-sm-12 col-md-8 col-lg-8'}
- )
-
- _processed_links = []
-
- def get_article_url(self, article):
- link = article.get('link', None)
-
- if link is None:
- return article
-
- # modificamos la url de las noticias de los equipos deportivos para que funcionen, por ejemplo:
- # http://athletic.elcorreo.com/noticias/201407/27/muniain-estrella-athletic-para-20140727093046.html
- # http://m.elcorreo.com/noticias/201407/27/muniain-estrella-athletic-para-20140727093046.html?external=deportes/athletic
-
- parte = link.split('/')
-
- if parte[2] == 'athletic.elcorreo.com':
- link = 'http://www.elcorreo.com/' + parte[3] + '/' + parte[
- 4
- ] + '/' + parte[5] + '/' + parte[6] + '?external=deportes/athletic'
- else:
- if parte[2] == 'baskonia.elcorreo.com':
- link = 'http://www.elcorreo.com/' + parte[3] + '/' + parte[
- 4
- ] + '/' + parte[5] + '/' + parte[6] + '?external=deportes/baskonia'
+ def preprocess_html(self, soup):
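+ # Hoist the headline and subtitle above the article body; the subtitle becomes an italic paragraph styled via #sub.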
+ art = soup.find('article')
+ h1 = soup.find('h1')
+ h2 = soup.find('h2')
+ if h1 and art:
+ art.insert_before(h1)
+ if h2 and h1:
+ h1.insert_after(h2)
+ h2.name = 'p'
+ h2['id'] = 'sub'
+ for but in soup.findAll('button'):
+ if but.find('img'):
+ but.name = 'div'
else:
- if parte[2] == 'bilbaobasket.elcorreo.com':
- link = 'http://www.elcorreo.com/' + parte[3] + '/' + parte[
- 4
- ] + '/' + parte[5] + '/' + parte[
- 6
- ] + '?external=deportes/bilbaobasket'
- else:
- if parte[2] == 'alaves.elcorreo.com':
- link = 'http://www.elcorreo.com/' + parte[3] + '/' + parte[
- 4
- ] + '/' + parte[5] + '/' + parte[
- 6
- ] + '?external=deportes/alaves'
-
- # A veces el mismo articulo aparece en la versión de Alava y en la de Bizkaia. Por ejemplo:
- # http://www.elcorreo.com/alava/deportes/motor/formula-1/201407/27/ecclestone-quiere-briatore-ayude-20140727140820-rc.html
- # http://www.elcorreo.com/bizkaia/deportes/motor/formula-1/201407/27/ecclestone-quiere-briatore-ayude-20140727140820-rc.html
- # para controlar los duplicados, unificamos las url para que sean siempre de bizkaia (excepto para la sección "araba")
-
- if ((parte[3] == 'alava') and (parte[4] != 'araba')):
- link = link.replace('elcorreo.com/alava', 'elcorreo.com/bizkaia')
-
- # Controlamos si el artículo ha sido incluido en otro feed para eliminarlo
-
- if link not in self._processed_links:
- self._processed_links.append(link)
- else:
- link = None
-
- return link
-
- # Recuperamos la portada de papel (la imagen format=1 tiene mayor resolucion)
-
- def get_cover_url(self):
- cover = None
- st = time.localtime()
- year = str(st.tm_year)
- month = "%.2d" % st.tm_mon
- day = "%.2d" % st.tm_mday
- # http://info.elcorreo.com/pdf/07082013-viz.pdf
- cover = 'http://info.elcorreo.com/pdf/' + day + month + year + '-viz.pdf'
- br = BasicNewsRecipe.get_browser(self)
- try:
- br.open(cover)
- except:
- self.log("\nPortada no disponible")
- cover = 'http://www.elcorreo.com/vizcaya/noticias/201002/02/Media/logo-elcorreo-nuevo.png'
- return cover
-
- # Para cambiar el estilo del texto
-
- extra_css = '''
- h1 {font-family:Arial,Helvetica,sans-serif; font-weight:bold;font-size:28px;}
- h2 {font-family:georgia,serif; font-style:italic; font-weight:normal;font-size:16px;color:#4D4D4D;}
- h3 {font-family:georgia,serif; font-weight:bold;font-size:18px;}
- '''
-
- preprocess_regexps = [
-
- # Para presentar la imagen de los video incrustados
- (
- re.compile(r'stillURLVideo: \'', re.DOTALL | re.IGNORECASE),
- lambda match: '