From 21cdcf03a237a0c4979c941d5a5385cae44c7906 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 26 Oct 2024 18:02:21 +0000 Subject: [PATCH 01/30] [ie] Resolve `language` to ISO639-2 for ISM formats (#11359) Closes #11356 Authored by: bashonly --- yt_dlp/extractor/common.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 795105b7d878..ecece85f5fb9 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -47,6 +47,7 @@ FormatSorter, GeoRestrictedError, GeoUtils, + ISO639Utils, LenientJSONDecoder, Popen, RegexNotFoundError, @@ -3071,7 +3072,11 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): url_pattern = stream.attrib['Url'] stream_timescale = int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') - stream_language = stream.get('Language', 'und') + # IsmFD expects ISO 639 Set 2 language codes (3-character length) + # See: https://github.com/yt-dlp/yt-dlp/issues/11356 + stream_language = stream.get('Language') or 'und' + if len(stream_language) != 3: + stream_language = ISO639Utils.short2long(stream_language) or 'und' for track in stream.findall('QualityLevel'): KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'} fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag')) From 5c880ef42e9c2b2fc412f6d69dad37d34fb75a62 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 27 Oct 2024 00:17:26 +0200 Subject: [PATCH 02/30] [core] Populate format sorting fields before dependent fields (#11353) Authored by: Grub4K --- yt_dlp/YoutubeDL.py | 5 +---- yt_dlp/utils/_utils.py | 7 +++++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 48185b769316..f08a31afac77 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2849,13 +2849,10 @@ def is_wellformed(f): sanitize_string_field(fmt, 'format_id') sanitize_numeric_fields(fmt) fmt['url'] = sanitize_url(fmt['url']) - if fmt.get('ext') is None: - fmt['ext'] = determine_ext(fmt['url']).lower() + FormatSorter._fill_sorting_fields(fmt) if fmt['ext'] in ('aac', 'opus', 'mp3', 'flac', 'vorbis'): if fmt.get('acodec') is None: fmt['acodec'] = fmt['ext'] - if fmt.get('protocol') is None: - fmt['protocol'] = determine_protocol(fmt) if fmt.get('resolution') is None: fmt['resolution'] = self.format_resolution(fmt, default=None) if fmt.get('dynamic_range') is None and fmt.get('vcodec') != 'none': diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 7aff67ddfc25..b7de04e63858 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5578,14 +5578,15 @@ def _calculate_field_preference(self, format_, field): value = get_value(field) return self._calculate_field_preference_from_value(format_, field, type_, value) - def calculate_preference(self, format): + @staticmethod + def _fill_sorting_fields(format): # Determine missing protocol if not format.get('protocol'): format['protocol'] = determine_protocol(format) # Determine missing ext if not format.get('ext') and 'url' in format: - format['ext'] = determine_ext(format['url']) + format['ext'] = determine_ext(format['url']).lower() if format.get('vcodec') == 'none': format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none' format['video_ext'] = 'none' @@ -5613,6 +5614,8 @@ def calculate_preference(self, format): if not format.get('tbr'): format['tbr'] = try_call(lambda: format['vbr'] + format['abr']) 
or None + def calculate_preference(self, format): + self._fill_sorting_fields(format) return tuple(self._calculate_field_preference(format, field) for field in self._order) From 0a3991edae0e10f2ea41ece9fdea5e48f789f1de Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 27 Oct 2024 23:00:02 +0000 Subject: [PATCH 03/30] [devscripts] `make_changelog`: Parse full commit message for fixes (#11366) Authored by: Grub4K, bashonly Co-authored-by: Simon Sawicki --- devscripts/changelog_override.json | 18 ++++++++++++ devscripts/make_changelog.py | 44 ++++++++++++++---------------- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index e5d6958fca93..08ea9666ed20 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -216,5 +216,23 @@ "action": "add", "when": "d784464399b600ba9516bbcec6286f11d68974dd", "short": "[priority] **The minimum *required* Python version has been raised to 3.9**\nPython 3.8 reached its end-of-life on 2024.10.07, and yt-dlp has now removed support for it. As an unfortunate side effect, the official `yt-dlp.exe` and `yt-dlp_x86.exe` binaries are no longer supported on Windows 7. [Read more](https://github.com/yt-dlp/yt-dlp/issues/10086)" + }, + { + "action": "change", + "when": "914af9a0cf51c9a3f74aa88d952bee8334c67511", + "short": "Expand paths in `--plugin-dirs` (#11334)", + "authors": ["bashonly"] + }, + { + "action": "change", + "when": "c29f5a7fae93a08f3cfbb6127b2faa75145b06a0", + "short": "[ie/generic] Do not impersonate by default (#11336)", + "authors": ["bashonly"] + }, + { + "action": "change", + "when": "57212a5f97ce367590aaa5c3e9a135eead8f81f7", + "short": "[ie/vimeo] Fix API retries (#11351)", + "authors": ["bashonly"] } ] diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 00634fb9116d..7c876101b49d 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -71,14 +71,13 @@ def group_lookup(cls): def get(cls, value: str) -> tuple[CommitGroup | None, str | None]: group, _, subgroup = (group.strip().lower() for group in value.partition('/')) - result = cls.group_lookup().get(group) - if not result: - if subgroup: - return None, value - subgroup = group - result = cls.subgroup_lookup().get(subgroup) + if result := cls.group_lookup().get(group): + return result, subgroup or None - return result, subgroup or None + if subgroup: + return None, value + + return cls.subgroup_lookup().get(group), group or None @dataclass @@ -136,8 +135,7 @@ def _format_groups(self, groups): first = False yield '\n
<details><summary><h3>Changelog</h3></summary>
\n' - group = groups[item] - if group: + if group := groups[item]: yield self.format_module(item.value, group) if self._collapsible: @@ -253,7 +251,7 @@ class CommitRange: ''', re.VERBOSE | re.DOTALL) EXTRACTOR_INDICATOR_RE = re.compile(r'(?:Fix|Add)\s+Extractors?', re.IGNORECASE) REVERT_RE = re.compile(r'(?:\[[^\]]+\]\s+)?(?i:Revert)\s+([\da-f]{40})') - FIXES_RE = re.compile(r'(?i:Fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Revert|Improve)\s+([\da-f]{40})') + FIXES_RE = re.compile(r'(?i:(?:bug\s*)?fix(?:es)?(?:\s+bugs?)?(?:\s+in|\s+for)?|Improve)\s+([\da-f]{40})') UPSTREAM_MERGE_RE = re.compile(r'Update to ytdl-commit-([\da-f]+)') def __init__(self, start, end, default_author=None): @@ -287,11 +285,16 @@ def _get_commits_and_fixes(self, default_author): short = next(lines) skip = short.startswith('Release ') or short == '[version] update' + fix_commitish = None + if match := self.FIXES_RE.search(short): + fix_commitish = match.group(1) + authors = [default_author] if default_author else [] for line in iter(lambda: next(lines), self.COMMIT_SEPARATOR): - match = self.AUTHOR_INDICATOR_RE.match(line) - if match: + if match := self.AUTHOR_INDICATOR_RE.match(line): authors = sorted(map(str.strip, line[match.end():].split(',')), key=str.casefold) + if not fix_commitish and (match := self.FIXES_RE.fullmatch(line)): + fix_commitish = match.group(1) commit = Commit(commit_hash, short, authors) if skip and (self._start or not i): @@ -301,21 +304,17 @@ def _get_commits_and_fixes(self, default_author): logger.debug(f'Reached Release commit, breaking: {commit}') break - revert_match = self.REVERT_RE.fullmatch(commit.short) - if revert_match: - reverts[revert_match.group(1)] = commit + if match := self.REVERT_RE.fullmatch(commit.short): + reverts[match.group(1)] = commit continue - fix_match = self.FIXES_RE.search(commit.short) - if fix_match: - commitish = fix_match.group(1) - fixes[commitish].append(commit) + if fix_commitish: + fixes[fix_commitish].append(commit) commits[commit.hash] = commit for commitish, revert_commit in reverts.items(): - reverted = commits.pop(commitish, None) - if reverted: + if reverted := commits.pop(commitish, None): logger.debug(f'{commitish} fully reverted {reverted}') else: commits[revert_commit.hash] = revert_commit @@ -461,8 +460,7 @@ def create_changelog(args): logger.info(f'Loaded {len(commits)} commits') - new_contributors = get_new_contributors(args.contributors_path, commits) - if new_contributors: + if new_contributors := get_new_contributors(args.contributors_path, commits): if args.contributors: write_file(args.contributors_path, '\n'.join(new_contributors) + '\n', mode='a') logger.info(f'New contributors: {", ".join(new_contributors)}') From 330335386d4f7603d92d6796798375336005275e Mon Sep 17 00:00:00 2001 From: JAB Date: Mon, 28 Oct 2024 00:18:25 +0100 Subject: [PATCH 04/30] [ie/ccma] Support new 3cat.cat domain (#11222) Closes #11163 Authored by: JoseAngelB --- yt_dlp/extractor/ccma.py | 61 +++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/ccma.py b/yt_dlp/extractor/ccma.py index ffe4b49c15d9..7014c208d43a 100644 --- a/yt_dlp/extractor/ccma.py +++ b/yt_dlp/extractor/ccma.py @@ -12,53 +12,86 @@ class CCMAIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ccma\.cat/(?:[^/]+/)*?(?Pvideo|audio)/(?P\d+)' + IE_DESC = '3Cat, TV3 and Catalunya Ràdio' + _VALID_URL = r'https?://(?:www\.)?3cat\.cat/(?:3cat|tv3/sx3)/[^/?#]+/(?Pvideo|audio)/(?P\d+)' _TESTS = [{ - 'url': 
'http://www.ccma.cat/tv3/alacarta/lespot-de-la-marato-de-tv3/lespot-de-la-marato-de-tv3/video/5630208/', + # ccma.cat/tv3/alacarta/ URLs redirect to 3cat.cat/3cat/ + 'url': 'https://www.3cat.cat/3cat/lespot-de-la-marato-de-tv3/video/5630208/', 'md5': '7296ca43977c8ea4469e719c609b0871', 'info_dict': { 'id': '5630208', 'ext': 'mp4', - 'title': 'L\'espot de La Marató de TV3', + 'title': 'L\'espot de La Marató 2016: Ictus i les lesions medul·lars i cerebrals traumàtiques', 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', 'timestamp': 1478608140, 'upload_date': '20161108', 'age_limit': 0, + 'alt_title': 'EsportMarató2016WEB_PerPublicar', + 'duration': 79, + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/4/6/1478536106664.jpg', + 'series': 'Dedicada a l\'ictus i les lesions medul·lars i cerebrals traumàtiques', + 'categories': ['Divulgació'], }, }, { - 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', + # ccma.cat/catradio/alacarta/ URLs redirect to 3cat.cat/3cat/ + 'url': 'https://www.3cat.cat/3cat/el-consell-de-savis-analitza-el-derbi/audio/943685/', 'md5': 'fa3e38f269329a278271276330261425', 'info_dict': { 'id': '943685', 'ext': 'mp3', 'title': 'El Consell de Savis analitza el derbi', 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', - 'upload_date': '20170512', - 'timestamp': 1494622500, + 'upload_date': '20161217', + 'timestamp': 1482011700, 'vcodec': 'none', 'categories': ['Esports'], + 'series': 'Tot gira', + 'duration': 821, + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/8/9/1482002602598.jpg', }, }, { - 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', - 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', + 'url': 'https://www.3cat.cat/3cat/crims-josep-tallada-lespereu-me-part-1/video/6031387/', + 'md5': '27493513d08a3e5605814aee9bb778d2', 'info_dict': { 'id': '6031387', 'ext': 'mp4', - 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', + 'title': 'T1xC5 - Josep Talleda, l\'"Espereu-me" (part 1)', 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', - 'timestamp': 1582577700, + 'timestamp': 1582577919, 'upload_date': '20200224', - 'subtitles': 'mincount:4', - 'age_limit': 16, + 'subtitles': 'mincount:1', + 'age_limit': 13, 'series': 'Crims', + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/1/9/1582564376991.jpg', + 'duration': 3203, + 'categories': ['Divulgació'], + 'alt_title': 'Crims - 5 - Josep Talleda, l\'"Espereu-me" (1a part) - Josep Talleda, l\'"Espereu-me" (part 1)', + 'episode_number': 5, + 'episode': 'Episode 5', + }, + }, { + 'url': 'https://www.3cat.cat/tv3/sx3/una-mosca-volava-per-la-llum/video/5759227/', + 'info_dict': { + 'id': '5759227', + 'ext': 'mp4', + 'title': 'Una mosca volava per la llum', + 'alt_title': '17Z004Ç UNA MOSCA VOLAVA PER LA LLUM', + 'description': 'md5:9ab64276944b0825336f4147f13f7854', + 'series': 'Mic', + 'upload_date': '20180411', + 'timestamp': 1523440105, + 'duration': 160, + 'age_limit': 0, + 'thumbnail': 'https://img.3cat.cat/multimedia/jpg/6/1/1524071667216.jpg', + 'categories': ['Música'], }, }] def _real_extract(self, url): - media_type, media_id = self._match_valid_url(url).groups() + media_type, media_id = self._match_valid_url(url).group('type', 'id') media = self._download_json( - 'http://dinamics.ccma.cat/pvideo/media.jsp', media_id, query={ + 'http://api-media.3cat.cat/pvideo/media.jsp', media_id, query={ 'media': media_type, 'idint': media_id, 'format': 'dm', From 
f101e5d34c97c608156ad5396714c2a2edca966a Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 28 Oct 2024 12:08:46 +0100 Subject: [PATCH 05/30] [ie/Soundcloud] Extract artists (#11377) Closes #11375 Authored by: seproDev --- yt_dlp/extractor/soundcloud.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 4f8d96407d0a..f4beab75b7df 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -208,7 +208,6 @@ def sign(self, user, pw, clid): def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_flat=False): track_id = str(info['id']) - title = info['title'] format_urls = set() formats = [] @@ -367,7 +366,7 @@ def extract_count(key): 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), 'uploader_url': user.get('permalink_url'), 'timestamp': unified_timestamp(info.get('created_at')), - 'title': title, + 'title': info.get('title'), 'description': info.get('description'), 'thumbnails': thumbnails, 'duration': float_or_none(info.get('duration'), 1000), @@ -377,7 +376,8 @@ def extract_count(key): 'like_count': extract_count('favoritings') or extract_count('likes'), 'comment_count': extract_count('comment'), 'repost_count': extract_count('reposts'), - 'genres': traverse_obj(info, ('genre', {str}, {lambda x: x or None}, all)), + 'genres': traverse_obj(info, ('genre', {str}, filter, all, filter)), + 'artists': traverse_obj(info, ('publisher_metadata', 'artist', {str}, filter, all, filter)), 'formats': formats if not extract_flat else None, } @@ -429,7 +429,6 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'thumbnail': 'https://i1.sndcdn.com/artworks-000031955188-rwb18x-original.jpg', 'uploader_url': 'https://soundcloud.com/ethmusic', - 'genres': [], }, }, # geo-restricted @@ -453,6 +452,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'uploader_url': 'https://soundcloud.com/the-concept-band', 'thumbnail': 'https://i1.sndcdn.com/artworks-v8bFHhXm7Au6-0-original.jpg', 'genres': ['Alternative'], + 'artists': ['The Royal Concept'], }, }, # private link @@ -525,6 +525,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'repost_count': int, 'view_count': int, 'genres': ['Dance & EDM'], + 'artists': ['80M'], }, }, # private link, downloadable format @@ -549,6 +550,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'thumbnail': 'https://i1.sndcdn.com/artworks-000240712245-kedn4p-original.jpg', 'uploader_url': 'https://soundcloud.com/oriuplift', 'genres': ['Trance'], + 'artists': ['Ori Uplift'], }, }, # no album art, use avatar pic for thumbnail @@ -572,7 +574,7 @@ class SoundcloudIE(SoundcloudBaseIE): 'comment_count': int, 'repost_count': int, 'uploader_url': 'https://soundcloud.com/garyvee', - 'genres': [], + 'artists': ['MadReal'], }, 'params': { 'skip_download': True, From f93c16395cea1fe9ffc3c594d3e019c3b214544c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 29 Oct 2024 23:24:17 +0000 Subject: [PATCH 06/30] [utils] Fix `find_element` by class (#11402) Fix d710a6ca7c622705c0c8c8a3615916f531137d5d Authored by: bashonly --- yt_dlp/utils/traversal.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index df3ff406f57d..0eef817eaace 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -391,14 +391,13 @@ def find_element(*, tag: str, html=False): ... 
def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False): # deliberately using `id=` and `cls=` for ease of readability assert tag or id or cls or (attr and value), 'One of tag, id, cls or (attr AND value) is required' - if not tag: - tag = r'[\w:.-]+' + ANY_TAG = r'[\w:.-]+' if attr and value: assert not cls, 'Cannot match both attr and cls' assert not id, 'Cannot match both attr and id' func = get_element_html_by_attribute if html else get_element_by_attribute - return functools.partial(func, attr, value, tag=tag) + return functools.partial(func, attr, value, tag=tag or ANY_TAG) elif cls: assert not id, 'Cannot match both cls and id' @@ -408,7 +407,7 @@ def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=Fal elif id: func = get_element_html_by_id if html else get_element_by_id - return functools.partial(func, id, tag=tag) + return functools.partial(func, id, tag=tag or ANY_TAG) index = int(bool(html)) return lambda html: get_element_text_and_html_by_tag(tag, html)[index] From 5bc5fb2835ea59bdf326bd12176d74d2c7348a95 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 29 Oct 2024 23:25:46 +0000 Subject: [PATCH 07/30] Allow thumbnails with `.jpe` extension (#11408) Fix 5ce582448ececb8d9c30c8c31f58330090ced03a Closes #11407 Authored by: bashonly --- yt_dlp/utils/_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index b7de04e63858..8535d28307b3 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5165,6 +5165,7 @@ class _UnsafeExtensionError(Exception): 'ico', 'image', 'jng', + 'jpe', 'jpeg', 'jxl', 'svg', From 88402b714ec124633933737bc156b172a3dec3d6 Mon Sep 17 00:00:00 2001 From: bashonly Date: Wed, 30 Oct 2024 13:40:07 -0500 Subject: [PATCH 08/30] Fix `--netrc` empty string parsing for Python <=3.10 (#11414) Ref: https://github.com/python/cpython/commit/15409c720be0503131713e3d3abc1acd0da07378 Closes #11413 Authored by: bashonly, Grub4K Co-authored-by: Simon Sawicki --- test/test_InfoExtractor.py | 12 ++++++++++++ test/testdata/netrc/netrc | 4 ++++ test/testdata/netrc/print_netrc.py | 2 ++ yt_dlp/extractor/common.py | 7 +++++++ 4 files changed, 25 insertions(+) create mode 100644 test/testdata/netrc/netrc create mode 100644 test/testdata/netrc/print_netrc.py diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 31e8f82448d4..54f35ef55221 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -53,6 +53,18 @@ def setUp(self): def test_ie_key(self): self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE) + def test_get_netrc_login_info(self): + for params in [ + {'usenetrc': True, 'netrc_location': './test/testdata/netrc/netrc'}, + {'netrc_cmd': f'{sys.executable} ./test/testdata/netrc/print_netrc.py'}, + ]: + ie = DummyIE(FakeYDL(params)) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='normal_use'), ('user', 'pass')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='empty_user'), ('', 'pass')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='empty_pass'), ('user', '')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='both_empty'), ('', '')) + self.assertEqual(ie._get_netrc_login_info(netrc_machine='nonexistent'), (None, None)) + def test_html_search_regex(self): html = '
<p id="foo">Watch this <a href="http://www.youtube.com/watch?v=BaW_jenozKc">video</a></p>
' search = lambda re, *args: self.ie._html_search_regex(re, html, *args) diff --git a/test/testdata/netrc/netrc b/test/testdata/netrc/netrc new file mode 100644 index 000000000000..bafe92fe6a1e --- /dev/null +++ b/test/testdata/netrc/netrc @@ -0,0 +1,4 @@ +machine normal_use login user password pass +machine empty_user login "" password pass +machine empty_pass login user password "" +machine both_empty login "" password "" diff --git a/test/testdata/netrc/print_netrc.py b/test/testdata/netrc/print_netrc.py new file mode 100644 index 000000000000..5c25814f8496 --- /dev/null +++ b/test/testdata/netrc/print_netrc.py @@ -0,0 +1,2 @@ +with open('./test/testdata/netrc/netrc', encoding='utf-8') as fp: + print(fp.read()) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ecece85f5fb9..7e6e6227d339 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1409,6 +1409,13 @@ def _get_netrc_login_info(self, netrc_machine=None): return None, None self.write_debug(f'Using netrc for {netrc_machine} authentication') + + # compat: <=py3.10: netrc cannot parse tokens as empty strings, will return `""` instead + # Ref: https://github.com/yt-dlp/yt-dlp/issues/11413 + # https://github.com/python/cpython/commit/15409c720be0503131713e3d3abc1acd0da07378 + if sys.version_info < (3, 11): + return tuple(x if x != '""' else '' for x in info[::2]) + return info[0], info[2] def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None): From d569a8845254d90ce13ad74ae76695e8d6441068 Mon Sep 17 00:00:00 2001 From: bashonly Date: Wed, 30 Oct 2024 13:41:26 -0500 Subject: [PATCH 09/30] [ie/youtube] Adjust OAuth refresh token handling (#11414) Removes support for using '' as an empty password in netrc, e.g.: machine youtube login oauth password '' Double-quotes ("") are valid and must be used instead, e.g.: machine youtube login oauth password "" Authored by: bashonly --- yt_dlp/extractor/youtube.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 5148e8261900..99b8bfecc92f 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -644,13 +644,14 @@ def _initialize_oauth(self, user, refresh_token): YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE[self._OAUTH_PROFILE] = {} if refresh_token: - refresh_token = refresh_token.strip('\'') or None - - # Allow refresh token passed to initialize cache - if refresh_token: + msg = f'{self._OAUTH_DISPLAY_ID}: Using password input as refresh token' + if self.get_param('cachedir') is not False: + msg += ' and caching token to disk; you should supply an empty password next time' + self.to_screen(msg) self.cache.store(self._NETRC_MACHINE, self._oauth_cache_key, refresh_token) + else: + refresh_token = self.cache.load(self._NETRC_MACHINE, self._oauth_cache_key) - refresh_token = refresh_token or self.cache.load(self._NETRC_MACHINE, self._oauth_cache_key) if refresh_token: YoutubeBaseInfoExtractor._OAUTH_ACCESS_TOKEN_CACHE[self._OAUTH_PROFILE]['refresh_token'] = refresh_token try: From 76802f461332d444e596437c42374fa237fa5174 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 30 Oct 2024 19:26:28 +0000 Subject: [PATCH 10/30] [ie/twitter] Remove cookies migration workaround (#11392) Closes #11338 Authored by: bashonly --- yt_dlp/extractor/twitter.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/yt_dlp/extractor/twitter.py 
b/yt_dlp/extractor/twitter.py index 5adaf16393c5..8196ce6c328b 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -150,14 +150,6 @@ def _search_dimensions_in_video_url(a_format, video_url): def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) - # XXX: Temporary workaround until twitter.com => x.com migration is completed - def _real_initialize(self): - if self.is_logged_in or not self._get_cookies('https://twitter.com/').get('auth_token'): - return - # User has not yet been migrated to x.com and has passed twitter.com cookies - TwitterBaseIE._API_BASE = 'https://api.twitter.com/1.1/' - TwitterBaseIE._GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' - @functools.cached_property def _selected_api(self): return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0] From b6dc2c49e8793c6dfa21275e61caf49ec1148b81 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 30 Oct 2024 21:53:41 +0000 Subject: [PATCH 11/30] [utils] Allow partial application for more functions (#11391) Also adds the `trim_str` traversal helper Authored by: bashonly, Grub4K Co-authored-by: Simon Sawicki --- test/test_traversal.py | 17 +++++++++++++++- test/test_utils.py | 14 +++++++++++++ yt_dlp/utils/_utils.py | 43 ++++++++++++++++++++++++--------------- yt_dlp/utils/traversal.py | 14 +++++++++++++ 4 files changed, 71 insertions(+), 17 deletions(-) diff --git a/test/test_traversal.py b/test/test_traversal.py index 9179dadda47c..f1d123bd6e5f 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -12,9 +12,10 @@ str_or_none, ) from yt_dlp.utils.traversal import ( - traverse_obj, require, subs_list_to_dict, + traverse_obj, + trim_str, ) _TEST_DATA = { @@ -495,6 +496,20 @@ def test_subs_list_to_dict(self): {'url': 'https://example.com/subs/en2', 'ext': 'ext'}, ]}, '`quality` key should sort subtitle list accordingly' + def test_trim_str(self): + with pytest.raises(TypeError): + trim_str('positional') + + assert callable(trim_str(start='a')) + assert trim_str(start='ab')('abc') == 'c' + assert trim_str(end='bc')('abc') == 'a' + assert trim_str(start='a', end='c')('abc') == 'b' + assert trim_str(start='ab', end='c')('abc') == '' + assert trim_str(start='a', end='bc')('abc') == '' + assert trim_str(start='ab', end='bc')('abc') == '' + assert trim_str(start='abc', end='abc')('abc') == '' + assert trim_str(start='', end='')('abc') == 'abc' + class TestDictGet: def test_dict_get(self): diff --git a/test/test_utils.py b/test/test_utils.py index d4b846f56fba..04f91547a4c2 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,6 +4,7 @@ import os import sys import unittest +import unittest.mock import warnings import datetime as dt @@ -71,6 +72,7 @@ intlist_to_bytes, iri_to_uri, is_html, + join_nonempty, js_to_json, limit_length, locked_file, @@ -343,11 +345,13 @@ def test_remove_start(self): self.assertEqual(remove_start(None, 'A - '), None) self.assertEqual(remove_start('A - B', 'A - '), 'B') self.assertEqual(remove_start('B - A', 'A - '), 'B - A') + self.assertEqual(remove_start('non-empty', ''), 'non-empty') def test_remove_end(self): self.assertEqual(remove_end(None, ' - B'), None) self.assertEqual(remove_end('A - B', ' - B'), 'A') self.assertEqual(remove_end('B - A', ' - B'), 'B - A') + self.assertEqual(remove_end('non-empty', ''), 'non-empty') def test_remove_quotes(self): self.assertEqual(remove_quotes(None), None) @@ -2148,6 +2152,16 @@ def run_shell(args): assert run_shell(args) == 
expected assert run_shell(shell_quote(args, shell=True)) == expected + def test_partial_application(self): + assert callable(int_or_none(scale=10)), 'missing positional parameter should apply partially' + assert int_or_none(10, scale=0.1) == 100, 'positionally passed argument should call function' + assert int_or_none(v=10) == 10, 'keyword passed positional should call function' + assert int_or_none(scale=0.1)(10) == 100, 'call after partial application should call the function' + + assert callable(join_nonempty(delim=', ')), 'varargs positional should apply partially' + assert callable(join_nonempty()), 'varargs positional should apply partially' + assert join_nonempty(None, delim=', ') == '', 'passed varargs should call the function' + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 8535d28307b3..e30008e931a7 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -212,6 +212,23 @@ def write_json_file(obj, fn): raise +def partial_application(func): + sig = inspect.signature(func) + required_args = [ + param.name for param in sig.parameters.values() + if param.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.VAR_POSITIONAL) + if param.default is inspect.Parameter.empty + ] + + @functools.wraps(func) + def wrapped(*args, **kwargs): + if set(required_args[len(args):]).difference(kwargs): + return functools.partial(func, *args, **kwargs) + return func(*args, **kwargs) + + return wrapped + + def find_xpath_attr(node, xpath, key, val=None): """ Find the xpath xpath[@key=val] """ assert re.match(r'^[a-zA-Z_-]+$', key) @@ -1192,6 +1209,7 @@ def extract_timezone(date_str, default=None): return timezone, date_str +@partial_application def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ @@ -1269,6 +1287,7 @@ def unified_timestamp(date_str, day_first=True): return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds() +@partial_application def determine_ext(url, default_ext='unknown_video'): if url is None or '.' 
not in url: return default_ext @@ -1944,7 +1963,7 @@ def remove_start(s, start): def remove_end(s, end): - return s[:-len(end)] if s is not None and s.endswith(end) else s + return s[:-len(end)] if s is not None and end and s.endswith(end) else s def remove_quotes(s): @@ -1973,6 +1992,7 @@ def base_url(url): return re.match(r'https?://[^?#]+/', url).group() +@partial_application def urljoin(base, path): if isinstance(path, bytes): path = path.decode() @@ -1988,21 +2008,6 @@ def urljoin(base, path): return urllib.parse.urljoin(base, path) -def partial_application(func): - sig = inspect.signature(func) - - @functools.wraps(func) - def wrapped(*args, **kwargs): - try: - sig.bind(*args, **kwargs) - except TypeError: - return functools.partial(func, *args, **kwargs) - else: - return func(*args, **kwargs) - - return wrapped - - @partial_application def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1, base=None): if get_attr and v is not None: @@ -2583,6 +2588,7 @@ def urlencode_postdata(*args, **kargs): return urllib.parse.urlencode(*args, **kargs).encode('ascii') +@partial_application def update_url(url, *, query_update=None, **kwargs): """Replace URL components specified by kwargs @param url str or parse url tuple @@ -2603,6 +2609,7 @@ def update_url(url, *, query_update=None, **kwargs): return urllib.parse.urlunparse(url._replace(**kwargs)) +@partial_application def update_url_query(url, query): return update_url(url, query_update=query) @@ -2924,6 +2931,7 @@ def error_to_str(err): return f'{type(err).__name__}: {err}' +@partial_application def mimetype2ext(mt, default=NO_DEFAULT): if not isinstance(mt, str): if default is not NO_DEFAULT: @@ -4664,6 +4672,7 @@ def to_high_limit_path(path): return path +@partial_application def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY): val = traversal.traverse_obj(obj, *variadic(field)) if not val if ignore is NO_DEFAULT else val in variadic(ignore): @@ -4828,6 +4837,7 @@ def number_of_digits(number): return len('%d' % number) +@partial_application def join_nonempty(*values, delim='-', from_dict=None): if from_dict is not None: values = (traversal.traverse_obj(from_dict, variadic(v)) for v in values) @@ -5278,6 +5288,7 @@ def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffi time.sleep(delay) +@partial_application def make_archive_id(ie, video_id): ie_key = ie if isinstance(ie, str) else ie.ie_key() return f'{ie_key.lower()} {video_id}' diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index 0eef817eaace..dd9b4690beb2 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -435,6 +435,20 @@ def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False): return functools.partial(func, cls) +def trim_str(*, start=None, end=None): + def trim(s): + if s is None: + return None + start_idx = 0 + if start and s.startswith(start): + start_idx = len(start) + if end and s.endswith(end): + return s[start_idx:-len(end)] + return s[start_idx:] + + return trim + + def get_first(obj, *paths, **kwargs): return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False) From 428ffb75aa3534b275cf54de42693a4d261519da Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 31 Oct 2024 09:00:08 +0000 Subject: [PATCH 12/30] [build] Disable attestations for trusted publishing (#11418) Currently does not work with reusable workflows, e.g. 
release-nightly.yml calling release.yml Ref: https://github.com/pypa/gh-action-pypi-publish/releases/tag/v1.11.0 https://github.com/pypa/gh-action-pypi-publish/discussions/255 https://github.com/pypi/warehouse/issues/11096 Authored by: bashonly --- .github/workflows/release.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 8d0bc4026a1b..2bc09c64d0f0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -282,6 +282,7 @@ jobs: uses: pypa/gh-action-pypi-publish@release/v1 with: verbose: true + attestations: false # Currently doesn't work w/ reusable workflows (breaks nightly) publish: needs: [prepare, build] From a6783a3b9905e547f6c1d4df9d7c7999feda8afa Mon Sep 17 00:00:00 2001 From: "Nicolas F." Date: Fri, 1 Nov 2024 00:23:42 +0100 Subject: [PATCH 13/30] [ie/yle_areena] Support live events (#11358) Authored by: CounterPillow, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/yle_areena.py | 158 +++++++++++++++++++++------------ 1 file changed, 101 insertions(+), 57 deletions(-) diff --git a/yt_dlp/extractor/yle_areena.py b/yt_dlp/extractor/yle_areena.py index c0a218e2fc98..c2daddfa6c37 100644 --- a/yt_dlp/extractor/yle_areena.py +++ b/yt_dlp/extractor/yle_areena.py @@ -1,12 +1,13 @@ from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( + ExtractorError, int_or_none, + parse_iso8601, smuggle_url, - traverse_obj, - unified_strdate, url_or_none, ) +from ..utils.traversal import traverse_obj class YleAreenaIE(InfoExtractor): @@ -15,9 +16,9 @@ class YleAreenaIE(InfoExtractor): _TESTS = [ { 'url': 'https://areena.yle.fi/1-4371942', - 'md5': '932edda0ecf5dfd6423804182d32f8ac', + 'md5': 'd87e9a1e74e67e009990ddd413e426b4', 'info_dict': { - 'id': '0_a3tjk92c', + 'id': '1-4371942', 'ext': 'mp4', 'title': 'Pouchit', 'description': 'md5:01071d7056ceec375f63960f90c35366', @@ -26,37 +27,27 @@ class YleAreenaIE(InfoExtractor): 'season_number': 1, 'episode': 'Episode 2', 'episode_number': 2, - 'thumbnail': 'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/0_a3tjk92c/version/100061', - 'uploader_id': 'ovp@yle.fi', - 'duration': 1435, - 'view_count': int, - 'upload_date': '20181204', - 'release_date': '20190106', - 'timestamp': 1543916210, - 'subtitles': {'fin': [{'url': r're:^https?://', 'ext': 'srt'}]}, + 'thumbnail': r're:https://images\.cdn\.yle\.fi/image/upload/.+\.jpg', 'age_limit': 7, - 'webpage_url': 'https://areena.yle.fi/1-4371942', + 'release_date': '20190105', + 'release_timestamp': 1546725660, + 'duration': 1435, }, }, { 'url': 'https://areena.yle.fi/1-2158940', - 'md5': 'cecb603661004e36af8c5188b5212b12', + 'md5': '6369ddc5e07b5fdaeda27a495184143c', 'info_dict': { - 'id': '1_l38iz9ur', + 'id': '1-2158940', 'ext': 'mp4', 'title': 'Albi haluaa vessan', - 'description': 'md5:15236d810c837bed861fae0e88663c33', + 'description': 'Albi haluaa vessan.', 'series': 'Albi Lumiukko', - 'thumbnail': 'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/1_l38iz9ur/version/100021', - 'uploader_id': 'ovp@yle.fi', - 'duration': 319, - 'view_count': int, - 'upload_date': '20211202', - 'release_date': '20211215', - 'timestamp': 1638448202, - 'subtitles': {}, + 'thumbnail': r're:https://images\.cdn\.yle\.fi/image/upload/.+\.jpg', 'age_limit': 0, - 'webpage_url': 'https://areena.yle.fi/1-2158940', + 'release_date': '20211215', + 'release_timestamp': 1639555200, + 'duration': 319, }, }, { @@ -67,72 +58,125 
@@ class YleAreenaIE(InfoExtractor): 'title': 'HKO & Mälkki & Tanner', 'description': 'md5:b4f1b1af2c6569b33f75179a86eea156', 'series': 'Helsingin kaupunginorkesterin konsertteja', - 'thumbnail': r're:^https?://.+\.jpg$', + 'thumbnail': r're:https://images\.cdn\.yle\.fi/image/upload/.+\.jpg', 'release_date': '20230120', + 'release_timestamp': 1674242079, + 'duration': 8004, }, 'params': { 'skip_download': 'm3u8', }, }, + { + 'url': 'https://areena.yle.fi/1-72251830', + 'info_dict': { + 'id': '1-72251830', + 'ext': 'mp4', + 'title': r're:Pentulive 2024 | Pentulive \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'description': 'md5:1f118707d9093bf894a34fbbc865397b', + 'series': 'Pentulive', + 'thumbnail': r're:https://images\.cdn\.yle\.fi/image/upload/.+\.jpg', + 'live_status': 'is_live', + 'release_date': '20241025', + 'release_timestamp': 1729875600, + }, + 'params': { + 'skip_download': 'livestream', + }, + }, + { + 'url': 'https://areena.yle.fi/podcastit/1-71022852', + 'info_dict': { + 'id': '1-71022852', + 'ext': 'mp3', + 'title': 'Värityspäivä', + 'description': 'md5:c3a02b0455ec71d32cbe09d32ec161e2', + 'series': 'Murun ja Paukun ikioma kaupunki', + 'episode': 'Episode 1', + 'episode_number': 1, + 'release_date': '20240607', + 'release_timestamp': 1717736400, + 'duration': 442, + }, + }, ] def _real_extract(self, url): video_id, is_podcast = self._match_valid_url(url).group('id', 'podcast') - info = self._search_json_ld(self._download_webpage(url, video_id), video_id, default={}) + json_ld = self._search_json_ld(self._download_webpage(url, video_id), video_id, default={}) video_data = self._download_json( f'https://player.api.yle.fi/v1/preview/{video_id}.json?app_id=player_static_prod&app_key=8930d72170e48303cf5f3867780d549b', video_id, headers={ 'origin': 'https://areena.yle.fi', 'referer': 'https://areena.yle.fi/', 'content-type': 'application/json', - }) + })['data'] # Example title: 'K1, J2: Pouchit | Modernit miehet' season_number, episode_number, episode, series = self._search_regex( r'K(?P\d+),\s*J(?P\d+):?\s*\b(?P[^|]+)\s*|\s*(?P.+)', - info.get('title') or '', 'episode metadata', group=('season_no', 'episode_no', 'episode', 'series'), + json_ld.get('title') or '', 'episode metadata', group=('season_no', 'episode_no', 'episode', 'series'), default=(None, None, None, None)) - description = traverse_obj(video_data, ('data', 'ongoing_ondemand', 'description', 'fin'), expected_type=str) + description = traverse_obj(video_data, ('ongoing_ondemand', 'description', 'fin', {str})) subtitles = {} - for sub in traverse_obj(video_data, ('data', 'ongoing_ondemand', 'subtitles', ...)): - if url_or_none(sub.get('uri')): - subtitles.setdefault(sub.get('language') or 'und', []).append({ - 'url': sub['uri'], - 'ext': 'srt', - 'name': sub.get('kind'), - }) + for sub in traverse_obj(video_data, ('ongoing_ondemand', 'subtitles', lambda _, v: url_or_none(v['uri']))): + subtitles.setdefault(sub.get('language') or 'und', []).append({ + 'url': sub['uri'], + 'ext': 'srt', + 'name': sub.get('kind'), + }) - if is_podcast: - info_dict = { - 'url': video_data['data']['ongoing_ondemand']['media_url'], - } - elif kaltura_id := traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id', {str})): - info_dict = { + info_dict, metadata = {}, {} + if is_podcast and traverse_obj(video_data, ('ongoing_ondemand', 'media_url', {url_or_none})): + metadata = video_data['ongoing_ondemand'] + info_dict['url'] = metadata['media_url'] + elif traverse_obj(video_data, ('ongoing_event', 'manifest_url', {url_or_none})): + 
metadata = video_data['ongoing_event'] + metadata.pop('duration', None) # Duration is not accurate for livestreams + info_dict['live_status'] = 'is_live' + elif traverse_obj(video_data, ('ongoing_ondemand', 'manifest_url', {url_or_none})): + metadata = video_data['ongoing_ondemand'] + # XXX: Has all externally-hosted Kaltura content been moved to native hosting? + elif kaltura_id := traverse_obj(video_data, ('ongoing_ondemand', 'kaltura', 'id', {str})): + metadata = video_data['ongoing_ondemand'] + info_dict.update({ '_type': 'url_transparent', 'url': smuggle_url(f'kaltura:1955031:{kaltura_id}', {'source_url': url}), 'ie_key': KalturaIE.ie_key(), - } + }) + elif traverse_obj(video_data, ('gone', {dict})): + self.raise_no_formats('The content is no longer available', expected=True, video_id=video_id) + metadata = video_data['gone'] else: - formats, subs = self._extract_m3u8_formats_and_subtitles( - video_data['data']['ongoing_ondemand']['manifest_url'], video_id, 'mp4', m3u8_id='hls') + raise ExtractorError('Unable to extract content') + + if not info_dict.get('url') and metadata.get('manifest_url'): + info_dict['formats'], subs = self._extract_m3u8_formats_and_subtitles( + metadata['manifest_url'], video_id, 'mp4', m3u8_id='hls') self._merge_subtitles(subs, target=subtitles) - info_dict = {'formats': formats} return { - **info_dict, + **traverse_obj(json_ld, { + 'title': 'title', + 'thumbnails': ('thumbnails', ..., {'url': 'url'}), + }), 'id': video_id, - 'title': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'title', 'fin'), expected_type=str) - or episode or info.get('title')), + 'title': episode, 'description': description, - 'series': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'series', 'title', 'fin'), expected_type=str) - or series), + 'series': series, 'season_number': (int_or_none(self._search_regex(r'Kausi (\d+)', description, 'season number', default=None)) or int_or_none(season_number)), - 'episode_number': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'episode_number'), expected_type=int_or_none) - or int_or_none(episode_number)), - 'thumbnails': traverse_obj(info, ('thumbnails', ..., {'url': 'url'})), - 'age_limit': traverse_obj(video_data, ('data', 'ongoing_ondemand', 'content_rating', 'age_restriction'), expected_type=int_or_none), + 'episode_number': int_or_none(episode_number), 'subtitles': subtitles or None, - 'release_date': unified_strdate(traverse_obj(video_data, ('data', 'ongoing_ondemand', 'start_time'), expected_type=str)), + **traverse_obj(metadata, { + 'title': ('title', 'fin', {str}), + 'description': ('description', 'fin', {str}), + 'series': ('series', 'title', 'fin', {str}), + 'episode_number': ('episode_number', {int_or_none}), + 'age_limit': ('content_rating', 'age_restriction', {int_or_none}), + 'release_timestamp': ('start_time', {parse_iso8601}), + 'duration': ('duration', 'duration_in_seconds', {int_or_none}), + }), + **info_dict, } From 422195ec70a00b0d2002b238cacbae7790c57fdf Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sat, 2 Nov 2024 21:42:00 +0100 Subject: [PATCH 14/30] [utils] Allow partial application for even more functions (#11437) Fixes b6dc2c49e8793c6dfa21275e61caf49ec1148b81 Authored by: Grub4K --- test/test_traversal.py | 11 +++++++++++ yt_dlp/utils/_utils.py | 1 + yt_dlp/utils/traversal.py | 8 ++++++++ 3 files changed, 20 insertions(+) diff --git a/test/test_traversal.py b/test/test_traversal.py index f1d123bd6e5f..1c0cc5362cc6 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -9,6 +9,7 
@@ determine_ext, dict_get, int_or_none, + join_nonempty, str_or_none, ) from yt_dlp.utils.traversal import ( @@ -16,6 +17,7 @@ subs_list_to_dict, traverse_obj, trim_str, + unpack, ) _TEST_DATA = { @@ -510,6 +512,15 @@ def test_trim_str(self): assert trim_str(start='abc', end='abc')('abc') == '' assert trim_str(start='', end='')('abc') == 'abc' + def test_unpack(self): + assert unpack(lambda *x: ''.join(map(str, x)))([1, 2, 3]) == '123' + assert unpack(join_nonempty)([1, 2, 3]) == '1-2-3' + assert unpack(join_nonempty(delim=' '))([1, 2, 3]) == '1 2 3' + with pytest.raises(TypeError): + unpack(join_nonempty)() + with pytest.raises(TypeError): + unpack() + class TestDictGet: def test_dict_get(self): diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index e30008e931a7..2f4c0a00f614 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5294,6 +5294,7 @@ def make_archive_id(ie, video_id): return f'{ie_key.lower()} {video_id}' +@partial_application def truncate_string(s, left, right=0): assert left > 3 and right >= 0 if s is None or len(s) <= left + right: diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index dd9b4690beb2..bc313d5c423c 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -449,6 +449,14 @@ def trim(s): return trim +def unpack(func): + @functools.wraps(func) + def inner(items, **kwargs): + return func(*items, **kwargs) + + return inner + + def get_first(obj, *paths, **kwargs): return traverse_obj(obj, *((..., *variadic(keys)) for keys in paths), **kwargs, get_all=False) From 5c7a5aaab27e9c3cb367b663a6136ca58866e547 Mon Sep 17 00:00:00 2001 From: Willow <108599378+MellowKyler@users.noreply.github.com> Date: Sun, 3 Nov 2024 02:07:25 -0600 Subject: [PATCH 15/30] [ie/Bluesky] Add extractor (#11055) Closes #10987 Authored by: MellowKyler, seproDev Co-authored-by: sepro --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/bluesky.py | 388 ++++++++++++++++++++++++++++++++ 2 files changed, 389 insertions(+) create mode 100644 yt_dlp/extractor/bluesky.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 13b5633d4627..ff5ddb2493bd 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -278,6 +278,7 @@ from .blerp import BlerpIE from .blogger import BloggerIE from .bloomberg import BloombergIE +from .bluesky import BlueskyIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE from .boosty import BoostyIE diff --git a/yt_dlp/extractor/bluesky.py b/yt_dlp/extractor/bluesky.py new file mode 100644 index 000000000000..42edd1107a99 --- /dev/null +++ b/yt_dlp/extractor/bluesky.py @@ -0,0 +1,388 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + format_field, + int_or_none, + mimetype2ext, + orderedSet, + parse_iso8601, + truncate_string, + update_url_query, + url_basename, + url_or_none, + variadic, +) +from ..utils.traversal import traverse_obj + + +class BlueskyIE(InfoExtractor): + _VALID_URL = [ + r'https?://(?:www\.)?(?:bsky\.app|main\.bsky\.dev)/profile/(?P[\w.:%-]+)/post/(?P\w+)', + r'at://(?P[\w.:%-]+)/app\.bsky\.feed\.post/(?P\w+)', + ] + _TESTS = [{ + 'url': 'https://bsky.app/profile/blu3blue.bsky.social/post/3l4omssdl632g', + 'md5': '375539c1930ab05d15585ed772ab54fd', + 'info_dict': { + 'id': '3l4omssdl632g', + 'ext': 'mp4', + 'uploader': 'Blu3Blu3Lilith', + 'uploader_id': 'blu3blue.bsky.social', + 'uploader_url': 'https://bsky.app/profile/blu3blue.bsky.social', + 'channel_id': 
'did:plc:pzdr5ylumf7vmvwasrpr5bf2', + 'channel_url': 'https://bsky.app/profile/did:plc:pzdr5ylumf7vmvwasrpr5bf2', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'OMG WE HAVE VIDEOS NOW', + 'description': 'OMG WE HAVE VIDEOS NOW', + 'upload_date': '20240921', + 'timestamp': 1726940605, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + }, + }, { + 'url': 'https://bsky.app/profile/bsky.app/post/3l3vgf77uco2g', + 'md5': 'b9e344fdbce9f2852c668a97efefb105', + 'info_dict': { + 'id': '3l3vgf77uco2g', + 'ext': 'mp4', + 'uploader': 'Bluesky', + 'uploader_id': 'bsky.app', + 'uploader_url': 'https://bsky.app/profile/bsky.app', + 'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur', + 'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'Bluesky now has video! Update your app to versi...', + 'alt_title': 'Bluesky video feature announcement', + 'description': r're:(?s)Bluesky now has video! .{239}', + 'upload_date': '20240911', + 'timestamp': 1726074716, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + 'subtitles': { + 'en': 'mincount:1', + }, + }, + }, { + 'url': 'https://main.bsky.dev/profile/souris.moe/post/3l4qhp7bcs52c', + 'md5': '5f2df8c200b5633eb7fb2c984d29772f', + 'info_dict': { + 'id': '3l4qhp7bcs52c', + 'ext': 'mp4', + 'uploader': 'souris', + 'uploader_id': 'souris.moe', + 'uploader_url': 'https://bsky.app/profile/souris.moe', + 'channel_id': 'did:plc:tj7g244gl5v6ai6cm4f4wlqp', + 'channel_url': 'https://bsky.app/profile/did:plc:tj7g244gl5v6ai6cm4f4wlqp', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'Bluesky video #3l4qhp7bcs52c', + 'upload_date': '20240922', + 'timestamp': 1727003838, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + }, + }, { + 'url': 'https://bsky.app/profile/de1.pds.tentacle.expert/post/3l3w4tnezek2e', + 'md5': '1af9c7fda061cf7593bbffca89e43d1c', + 'info_dict': { + 'id': '3l3w4tnezek2e', + 'ext': 'mp4', + 'uploader': 'clean', + 'uploader_id': 'de1.pds.tentacle.expert', + 'uploader_url': 'https://bsky.app/profile/de1.pds.tentacle.expert', + 'channel_id': 'did:web:de1.tentacle.expert', + 'channel_url': 'https://bsky.app/profile/did:web:de1.tentacle.expert', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'Bluesky video #3l3w4tnezek2e', + 'upload_date': '20240911', + 'timestamp': 1726098823, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + }, + }, { + 'url': 'https://bsky.app/profile/yunayuispink.bsky.social/post/3l7gqcfes742o', + 'info_dict': { + 'id': 'XxK3t_5V3ao', + 'ext': 'mp4', + 'uploader': 'yunayu', + 'uploader_id': '@yunayuispink', + 'uploader_url': 'https://www.youtube.com/@yunayuispink', + 'channel': 'yunayu', + 'channel_id': 'UCPLvXnHa7lTyNoR_dGsU14w', + 'channel_url': 'https://www.youtube.com/channel/UCPLvXnHa7lTyNoR_dGsU14w', + 'thumbnail': 'https://i.ytimg.com/vi_webp/XxK3t_5V3ao/maxresdefault.webp', + 'description': r're:Have a good goodx10000day', + 'title': '5min vs 5hours drawing', + 'availability': 'public', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'upload_date': '20241026', + 'timestamp': 1729967784, + 'duration': 321, + 'age_limit': 0, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'channel_follower_count': int, + 'categories': ['Entertainment'], + 'tags': [], + }, + 'add_ie': ['Youtube'], + }, { + 'url': 
'https://bsky.app/profile/endshark.bsky.social/post/3jzxjkcemae2m', + 'info_dict': { + 'id': '222792849', + 'ext': 'mp3', + 'uploader': 'LASERBAT', + 'uploader_id': 'laserbatx', + 'uploader_url': 'https://laserbatx.bandcamp.com', + 'artists': ['LASERBAT'], + 'album_artists': ['LASERBAT'], + 'album': 'Hari Nezumi [EP]', + 'track': 'Forward to the End', + 'title': 'LASERBAT - Forward to the End', + 'thumbnail': 'https://f4.bcbits.com/img/a2507705510_5.jpg', + 'duration': 228.571, + 'track_id': '222792849', + 'release_date': '20230423', + 'upload_date': '20230423', + 'timestamp': 1682276040.0, + 'release_timestamp': 1682276040.0, + 'track_number': 1, + }, + 'add_ie': ['Bandcamp'], + }, { + 'url': 'https://bsky.app/profile/dannybhoix.bsky.social/post/3l6oe5mtr2c2j', + 'md5': 'b9e344fdbce9f2852c668a97efefb105', + 'info_dict': { + 'id': '3l3vgf77uco2g', + 'ext': 'mp4', + 'uploader': 'Bluesky', + 'uploader_id': 'bsky.app', + 'uploader_url': 'https://bsky.app/profile/bsky.app', + 'channel_id': 'did:plc:z72i7hdynmk6r22z27h6tvur', + 'channel_url': 'https://bsky.app/profile/did:plc:z72i7hdynmk6r22z27h6tvur', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'Bluesky now has video! Update your app to versi...', + 'alt_title': 'Bluesky video feature announcement', + 'description': r're:(?s)Bluesky now has video! .{239}', + 'upload_date': '20240911', + 'timestamp': 1726074716, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + 'subtitles': { + 'en': 'mincount:1', + }, + }, + }, { + 'url': 'https://bsky.app/profile/alt.bun.how/post/3l7rdfxhyds2f', + 'md5': '8775118b235cf9fa6b5ad30f95cda75c', + 'info_dict': { + 'id': '3l7rdfxhyds2f', + 'ext': 'mp4', + 'uploader': 'cinnamon', + 'uploader_id': 'alt.bun.how', + 'uploader_url': 'https://bsky.app/profile/alt.bun.how', + 'channel_id': 'did:plc:7x6rtuenkuvxq3zsvffp2ide', + 'channel_url': 'https://bsky.app/profile/did:plc:7x6rtuenkuvxq3zsvffp2ide', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'crazy that i look like this tbh', + 'description': 'crazy that i look like this tbh', + 'upload_date': '20241030', + 'timestamp': 1730332128, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': ['sexual'], + 'age_limit': 18, + }, + }, { + 'url': 'at://did:plc:ia76kvnndjutgedggx2ibrem/app.bsky.feed.post/3l6zrz6zyl2dr', + 'md5': '71b0eb6d85d03145e6af6642c7fc6d78', + 'info_dict': { + 'id': '3l6zrz6zyl2dr', + 'ext': 'mp4', + 'uploader': 'mary🐇', + 'uploader_id': 'mary.my.id', + 'uploader_url': 'https://bsky.app/profile/mary.my.id', + 'channel_id': 'did:plc:ia76kvnndjutgedggx2ibrem', + 'channel_url': 'https://bsky.app/profile/did:plc:ia76kvnndjutgedggx2ibrem', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'title': 'Bluesky video #3l6zrz6zyl2dr', + 'upload_date': '20241021', + 'timestamp': 1729523172, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'tags': [], + }, + }, { + 'url': 'https://bsky.app/profile/purpleicetea.bsky.social/post/3l7gv55dc2o2w', + 'info_dict': { + 'id': '3l7gv55dc2o2w', + }, + 'playlist': [{ + 'info_dict': { + 'id': '3l7gv55dc2o2w', + 'ext': 'mp4', + 'upload_date': '20241026', + 'description': 'One of my favorite videos', + 'comment_count': int, + 'uploader_url': 'https://bsky.app/profile/purpleicetea.bsky.social', + 'uploader': 'Purple.Ice.Tea', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'channel_url': 'https://bsky.app/profile/did:plc:bjh5ffwya5f53dfy47dezuwx', + 'like_count': int, + 
'channel_id': 'did:plc:bjh5ffwya5f53dfy47dezuwx', + 'repost_count': int, + 'timestamp': 1729973202, + 'tags': [], + 'uploader_id': 'purpleicetea.bsky.social', + 'title': 'One of my favorite videos', + }, + }, { + 'info_dict': { + 'id': '3l77u64l7le2e', + 'ext': 'mp4', + 'title': 'hearing people on twitter say that bluesky isn\'...', + 'like_count': int, + 'uploader_id': 'thafnine.net', + 'uploader_url': 'https://bsky.app/profile/thafnine.net', + 'upload_date': '20241024', + 'channel_url': 'https://bsky.app/profile/did:plc:6ttyq36rhiyed7wu3ws7dmqj', + 'description': r're:(?s)hearing people on twitter say that bluesky .{93}', + 'tags': [], + 'alt_title': 'md5:9b1ee1937fb3d1a81e932f9ec14d560e', + 'uploader': 'T9', + 'channel_id': 'did:plc:6ttyq36rhiyed7wu3ws7dmqj', + 'thumbnail': r're:https://video.bsky.app/watch/.*\.jpg$', + 'timestamp': 1729731642, + 'comment_count': int, + 'repost_count': int, + }, + }], + }] + _BLOB_URL_TMPL = '{}/xrpc/com.atproto.sync.getBlob' + + def _get_service_endpoint(self, did, video_id): + if did.startswith('did:web:'): + url = f'https://{did[8:]}/.well-known/did.json' + else: + url = f'https://plc.directory/{did}' + services = self._download_json( + url, video_id, 'Fetching service endpoint', 'Falling back to bsky.social', fatal=False) + return traverse_obj( + services, ('service', lambda _, x: x['type'] == 'AtprotoPersonalDataServer', + 'serviceEndpoint', {url_or_none}, any)) or 'https://bsky.social' + + def _real_extract(self, url): + handle, video_id = self._match_valid_url(url).group('handle', 'id') + + post = self._download_json( + 'https://public.api.bsky.app/xrpc/app.bsky.feed.getPostThread', + video_id, query={ + 'uri': f'at://{handle}/app.bsky.feed.post/{video_id}', + 'depth': 0, + 'parentHeight': 0, + })['thread']['post'] + + entries = [] + # app.bsky.embed.video.view/app.bsky.embed.external.view + entries.extend(self._extract_videos(post, video_id)) + # app.bsky.embed.recordWithMedia.view + entries.extend(self._extract_videos( + post, video_id, embed_path=('embed', 'media'), record_subpath=('embed', 'media'))) + # app.bsky.embed.record.view + if nested_post := traverse_obj(post, ('embed', 'record', ('record', None), {dict}, any)): + entries.extend(self._extract_videos( + nested_post, video_id, embed_path=('embeds', 0), record_path='value')) + + if not entries: + raise ExtractorError('No video could be found in this post', expected=True) + if len(entries) == 1: + return entries[0] + return self.playlist_result(entries, video_id) + + @staticmethod + def _build_profile_url(path): + return format_field(path, None, 'https://bsky.app/profile/%s', default=None) + + def _extract_videos(self, root, video_id, embed_path='embed', record_path='record', record_subpath='embed'): + embed_path = variadic(embed_path, (str, bytes, dict, set)) + record_path = variadic(record_path, (str, bytes, dict, set)) + record_subpath = variadic(record_subpath, (str, bytes, dict, set)) + + entries = [] + if external_uri := traverse_obj(root, ( + ((*record_path, *record_subpath), embed_path), 'external', 'uri', {url_or_none}, any)): + entries.append(self.url_result(external_uri)) + if playlist := traverse_obj(root, (*embed_path, 'playlist', {url_or_none})): + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + playlist, video_id, 'mp4', m3u8_id='hls', fatal=False) + else: + return entries + + video_cid = traverse_obj( + root, (*embed_path, 'cid', {str}), + (*record_path, *record_subpath, 'video', 'ref', '$link', {str})) + did = traverse_obj(root, ('author', 'did', {str})) 
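+        # When both are available, the raw video blob on the author's PDS is also added below as a 'blob' format (fetched via com.atproto.sync.getBlob), alongside the HLS formats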
+ + if did and video_cid: + endpoint = self._get_service_endpoint(did, video_id) + + formats.append({ + 'format_id': 'blob', + 'url': update_url_query( + self._BLOB_URL_TMPL.format(endpoint), {'did': did, 'cid': video_cid}), + **traverse_obj(root, (*embed_path, 'aspectRatio', { + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + })), + **traverse_obj(root, (*record_path, *record_subpath, 'video', { + 'filesize': ('size', {int_or_none}), + 'ext': ('mimeType', {mimetype2ext}), + })), + }) + + for sub_data in traverse_obj(root, ( + *record_path, *record_subpath, 'captions', lambda _, v: v['file']['ref']['$link'])): + subtitles.setdefault(sub_data.get('lang') or 'und', []).append({ + 'url': update_url_query( + self._BLOB_URL_TMPL.format(endpoint), {'did': did, 'cid': sub_data['file']['ref']['$link']}), + 'ext': traverse_obj(sub_data, ('file', 'mimeType', {mimetype2ext})), + }) + + entries.append({ + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(root, { + 'id': ('uri', {url_basename}), + 'thumbnail': (*embed_path, 'thumbnail', {url_or_none}), + 'alt_title': (*embed_path, 'alt', {str}, filter), + 'uploader': ('author', 'displayName', {str}), + 'uploader_id': ('author', 'handle', {str}), + 'uploader_url': ('author', 'handle', {self._build_profile_url}), + 'channel_id': ('author', 'did', {str}), + 'channel_url': ('author', 'did', {self._build_profile_url}), + 'like_count': ('likeCount', {int_or_none}), + 'repost_count': ('repostCount', {int_or_none}), + 'comment_count': ('replyCount', {int_or_none}), + 'timestamp': ('indexedAt', {parse_iso8601}), + 'tags': ('labels', ..., 'val', {str}, all, {orderedSet}), + 'age_limit': ( + 'labels', ..., 'val', {lambda x: 18 if x in ('sexual', 'porn', 'graphic-media') else None}, any), + 'description': (*record_path, 'text', {str}, filter), + 'title': (*record_path, 'text', {lambda x: x.replace('\n', '')}, {truncate_string(left=50)}), + }), + }) + return entries From b103aca24d35b72b405c340357dc01a0ed534281 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 3 Nov 2024 18:19:45 +0000 Subject: [PATCH 16/30] [utils] Fix and improve `find_element` and `find_elements` (#11443) Fix d710a6ca7c622705c0c8c8a3615916f531137d5d Authored by: bashonly, Grub4K Co-authored-by: Simon Sawicki --- test/test_traversal.py | 54 +++++++++++++++++++++++++++++++++++++++ yt_dlp/utils/traversal.py | 23 +++++++++-------- 2 files changed, 67 insertions(+), 10 deletions(-) diff --git a/test/test_traversal.py b/test/test_traversal.py index 1c0cc5362cc6..cc0228d2703a 100644 --- a/test/test_traversal.py +++ b/test/test_traversal.py @@ -13,6 +13,8 @@ str_or_none, ) from yt_dlp.utils.traversal import ( + find_element, + find_elements, require, subs_list_to_dict, traverse_obj, @@ -37,6 +39,14 @@ 'dict': {}, } +_TEST_HTML = ''' +
<html><body>
+    <div class="a">1</div>
+    <div class="a" id="x" custom="z">2</div>
+    <div class="b" data-id="y" custom="z">3</div>
+    <p class="a" id="y">4</p>
+    <p id="d" custom="e">5</p>
+</body></html>
+''' + class TestTraversal: def test_traversal_base(self): @@ -521,6 +531,50 @@ def test_unpack(self): with pytest.raises(TypeError): unpack() + def test_find_element(self): + for improper_kwargs in [ + dict(attr='data-id'), + dict(value='y'), + dict(attr='data-id', value='y', cls='a'), + dict(attr='data-id', value='y', id='x'), + dict(cls='a', id='x'), + dict(cls='a', tag='p'), + dict(cls='[ab]', regex=True), + ]: + with pytest.raises(AssertionError): + find_element(**improper_kwargs)(_TEST_HTML) + + assert find_element(cls='a')(_TEST_HTML) == '1' + assert find_element(cls='a', html=True)(_TEST_HTML) == '
<div class="a">1</div>
' + assert find_element(id='x')(_TEST_HTML) == '2' + assert find_element(id='[ex]')(_TEST_HTML) is None + assert find_element(id='[ex]', regex=True)(_TEST_HTML) == '2' + assert find_element(id='x', html=True)(_TEST_HTML) == '
<div class="a" id="x" custom="z">2</div>
' + assert find_element(attr='data-id', value='y')(_TEST_HTML) == '3' + assert find_element(attr='data-id', value='y(?:es)?')(_TEST_HTML) is None + assert find_element(attr='data-id', value='y(?:es)?', regex=True)(_TEST_HTML) == '3' + assert find_element( + attr='data-id', value='y', html=True)(_TEST_HTML) == '
<div class="b" data-id="y" custom="z">3</div>
' + + def test_find_elements(self): + for improper_kwargs in [ + dict(tag='p'), + dict(attr='data-id'), + dict(value='y'), + dict(attr='data-id', value='y', cls='a'), + dict(cls='a', tag='div'), + dict(cls='[ab]', regex=True), + ]: + with pytest.raises(AssertionError): + find_elements(**improper_kwargs)(_TEST_HTML) + + assert find_elements(cls='a')(_TEST_HTML) == ['1', '2', '4'] + assert find_elements(cls='a', html=True)(_TEST_HTML) == [ + '
<div class="a">1</div>', '<div class="a" id="x" custom="z">2</div>', '<p class="a" id="y">4</p>
'] + assert find_elements(attr='custom', value='z')(_TEST_HTML) == ['2', '3'] + assert find_elements(attr='custom', value='[ez]')(_TEST_HTML) == [] + assert find_elements(attr='custom', value='[ez]', regex=True)(_TEST_HTML) == ['2', '3', '5'] + class TestDictGet: def test_dict_get(self): diff --git a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index bc313d5c423c..361f239ba675 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -20,6 +20,7 @@ get_elements_html_by_class, get_elements_html_by_attribute, get_elements_by_attribute, + get_element_by_class, get_element_html_by_attribute, get_element_by_attribute, get_element_html_by_id, @@ -373,7 +374,7 @@ def subs_list_to_dict(subs: list[dict] | None = None, /, *, ext=None): @typing.overload -def find_element(*, attr: str, value: str, tag: str | None = None, html=False): ... +def find_element(*, attr: str, value: str, tag: str | None = None, html=False, regex=False): ... @typing.overload @@ -381,14 +382,14 @@ def find_element(*, cls: str, html=False): ... @typing.overload -def find_element(*, id: str, tag: str | None = None, html=False): ... +def find_element(*, id: str, tag: str | None = None, html=False, regex=False): ... @typing.overload -def find_element(*, tag: str, html=False): ... +def find_element(*, tag: str, html=False, regex=False): ... -def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False): +def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=False, regex=False): # deliberately using `id=` and `cls=` for ease of readability assert tag or id or cls or (attr and value), 'One of tag, id, cls or (attr AND value) is required' ANY_TAG = r'[\w:.-]+' @@ -397,17 +398,18 @@ def find_element(*, tag=None, id=None, cls=None, attr=None, value=None, html=Fal assert not cls, 'Cannot match both attr and cls' assert not id, 'Cannot match both attr and id' func = get_element_html_by_attribute if html else get_element_by_attribute - return functools.partial(func, attr, value, tag=tag or ANY_TAG) + return functools.partial(func, attr, value, tag=tag or ANY_TAG, escape_value=not regex) elif cls: assert not id, 'Cannot match both cls and id' assert tag is None, 'Cannot match both cls and tag' - func = get_element_html_by_class if html else get_elements_by_class + assert not regex, 'Cannot use regex with cls' + func = get_element_html_by_class if html else get_element_by_class return functools.partial(func, cls) elif id: func = get_element_html_by_id if html else get_element_by_id - return functools.partial(func, id, tag=tag or ANY_TAG) + return functools.partial(func, id, tag=tag or ANY_TAG, escape_value=not regex) index = int(bool(html)) return lambda html: get_element_text_and_html_by_tag(tag, html)[index] @@ -418,19 +420,20 @@ def find_elements(*, cls: str, html=False): ... @typing.overload -def find_elements(*, attr: str, value: str, tag: str | None = None, html=False): ... +def find_elements(*, attr: str, value: str, tag: str | None = None, html=False, regex=False): ... 
-def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False): +def find_elements(*, tag=None, cls=None, attr=None, value=None, html=False, regex=False): # deliberately using `cls=` for ease of readability assert cls or (attr and value), 'One of cls or (attr AND value) is required' if attr and value: assert not cls, 'Cannot match both attr and cls' func = get_elements_html_by_attribute if html else get_elements_by_attribute - return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+') + return functools.partial(func, attr, value, tag=tag or r'[\w:.-]+', escape_value=not regex) assert not tag, 'Cannot match both cls and tag' + assert not regex, 'Cannot use regex with cls' func = get_elements_html_by_class if html else get_elements_by_class return functools.partial(func, cls) From 3945677a75e94a1fecc085432d791e1c21220cd3 Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 3 Nov 2024 20:39:10 +0100 Subject: [PATCH 17/30] [core] Prioritize AV1 (#11153) Authored by: seproDev --- README.md | 12 ++++++------ yt_dlp/YoutubeDL.py | 2 +- yt_dlp/__init__.py | 3 +++ yt_dlp/extractor/youtube.py | 4 ++-- yt_dlp/options.py | 8 ++++---- yt_dlp/utils/_utils.py | 5 ++++- 6 files changed, 20 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 418203eea94d..103256f8aec7 100644 --- a/README.md +++ b/README.md @@ -1553,9 +1553,9 @@ The available fields are: All fields, unless specified otherwise, are sorted in descending order. To reverse this, prefix the field with a `+`. E.g. `+res` prefers format with the smallest resolution. Additionally, you can suffix a preferred value for the fields, separated by a `:`. E.g. `res:720` prefers larger videos, but no larger than 720p and the smallest video if there are no videos less than 720p. For `codec` and `ext`, you can provide two preferred values, the first for video and the second for audio. E.g. `+codec:avc:m4a` (equivalent to `+vcodec:avc,+acodec:m4a`) sets the video codec preference to `h264` > `h265` > `vp9` > `vp9.2` > `av01` > `vp8` > `h263` > `theora` and audio codec preference to `mp4a` > `aac` > `vorbis` > `opus` > `mp3` > `ac3` > `dts`. You can also make the sorting prefer the nearest values to the provided by using `~` as the delimiter. E.g. `filesize~1G` prefers the format with filesize closest to 1 GiB. -The fields `hasvid` and `ie_pref` are always given highest priority in sorting, irrespective of the user-defined order. This behavior can be changed by using `--format-sort-force`. Apart from these, the default order used is: `lang,quality,res,fps,hdr:12,vcodec:vp9.2,channels,acodec,size,br,asr,proto,ext,hasaud,source,id`. The extractors may override this default order, but they cannot override the user-provided order. +The fields `hasvid` and `ie_pref` are always given highest priority in sorting, irrespective of the user-defined order. This behavior can be changed by using `--format-sort-force`. Apart from these, the default order used is: `lang,quality,res,fps,hdr:12,vcodec,channels,acodec,size,br,asr,proto,ext,hasaud,source,id`. The extractors may override this default order, but they cannot override the user-provided order. -Note that the default has `vcodec:vp9.2`; i.e. `av1` is not preferred. Similarly, the default for hdr is `hdr:12`; i.e. Dolby Vision is not preferred. These choices are made since DV and AV1 formats are not yet fully compatible with most devices. This may be changed in the future as more devices become capable of smoothly playing back these formats. 
+Note that the default for hdr is `hdr:12`; i.e. Dolby Vision is not preferred. This choice was made since DV formats are not yet fully compatible with most devices. This may be changed in the future. If your format selector is `worst`, the last item is selected after sorting. This means it will select the format that is worst in all respects. Most of the time, what you actually want is the video with the smallest filesize instead. So it is generally better to use `-f best -S +size,+br,+res,+fps`. @@ -2205,7 +2205,7 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu * `avconv` is not supported as an alternative to `ffmpeg` * yt-dlp stores config files in slightly different locations to youtube-dl. See [CONFIGURATION](#configuration) for a list of correct locations * The default [output template](#output-template) is `%(title)s [%(id)s].%(ext)s`. There is no real reason for this change. This was changed before yt-dlp was ever made public and now there are no plans to change it back to `%(title)s-%(id)s.%(ext)s`. Instead, you may use `--compat-options filename` -* The default [format sorting](#sorting-formats) is different from youtube-dl and prefers higher resolution and better codecs rather than higher bitrates. You can use the `--format-sort` option to change this to any order you prefer, or use `--compat-options format-sort` to use youtube-dl's sorting order +* The default [format sorting](#sorting-formats) is different from youtube-dl and prefers higher resolution and better codecs rather than higher bitrates. You can use the `--format-sort` option to change this to any order you prefer, or use `--compat-options format-sort` to use youtube-dl's sorting order. Older versions of yt-dlp preferred VP9 due to its broader compatibility; you can use `--compat-options prefer-vp9-sort` to revert to that format sorting preference. These two compat options cannot be used together * The default format selector is `bv*+ba/b`. This means that if a combined video + audio format that is better than the best video-only format is found, the former will be preferred. Use `-f bv+ba/b` or `--compat-options format-spec` to revert this * Unlike youtube-dlc, yt-dlp does not allow merging multiple audio/video streams into one file by default (since this conflicts with the use of `-f bv*+ba`). If needed, this feature must be enabled using `--audio-multistreams` and `--video-multistreams`. You can also use `--compat-options multistreams` to enable both * `--no-abort-on-error` is enabled by default. 
Use `--abort-on-error` or `--compat-options abort-on-error` to abort on errors instead @@ -2234,11 +2234,11 @@ Some of yt-dlp's default options are different from that of youtube-dl and youtu For ease of use, a few more compat options are available: * `--compat-options all`: Use all compat options (**Do NOT use this!**) -* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext` -* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext` +* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort` +* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext,-prefer-vp9-sort` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` * `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx` -* `--compat-options 2023`: Currently does nothing. Use this to enable all future compat options +* `--compat-options 2023`: Same as `--compat-options prefer-vp9-sort`. Use this to enable all future compat options The following compat options restore vulnerable behavior from before security patches: diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index f08a31afac77..3186a999deb8 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -470,7 +470,7 @@ class YoutubeDL: The following options do not work when used through the API: filename, abort-on-error, multistreams, no-live-chat, format-sort, no-clean-infojson, no-playlist-metafiles, - no-keep-subs, no-attach-info-json, allow-unsafe-ext. + no-keep-subs, no-attach-info-json, allow-unsafe-ext, prefer-vp9-sort. Refer __init__.py for their implementation progress_template: Dictionary of templates for progress outputs. 
Allowed keys are 'download', 'postprocess', diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 9b3bd4acd214..a7665159bd61 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -159,6 +159,9 @@ def set_default_compat(compat_name, opt_name, default=True, remove_compat=True): opts.embed_infojson = False if 'format-sort' in opts.compat_opts: opts.format_sort.extend(FormatSorter.ytdl_default) + elif 'prefer-vp9-sort' in opts.compat_opts: + opts.format_sort.extend(FormatSorter._prefer_vp9_sort) + _video_multistreams_set = set_default_compat('multistreams', 'allow_multiple_video_streams', False, remove_compat=False) _audio_multistreams_set = set_default_compat('multistreams', 'allow_multiple_audio_streams', False, remove_compat=False) if _video_multistreams_set is False and _audio_multistreams_set is False: diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 99b8bfecc92f..88c032cdbac8 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -4777,7 +4777,7 @@ def is_bad_format(fmt): 'live_status': live_status, 'release_timestamp': live_start_time, '_format_sort_fields': ( # source_preference is lower for potentially damaged formats - 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang', 'proto'), + 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang', 'proto'), } subtitles = {} @@ -7858,7 +7858,7 @@ def _real_extract(self, url): 'section_start': int(clip_data['startTimeMs']) / 1000, 'section_end': int(clip_data['endTimeMs']) / 1000, '_format_sort_fields': ( # https protocol is prioritized for ffmpeg compatibility - 'proto:https', 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang'), + 'proto:https', 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec', 'channels', 'acodec', 'lang'), } diff --git a/yt_dlp/options.py b/yt_dlp/options.py index c4d2a7274326..8eb5f2a56483 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -483,13 +483,13 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'no-attach-info-json', 'embed-thumbnail-atomicparsley', 'no-external-downloader-progress', 'embed-metadata', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date', - 'prefer-legacy-http-handler', 'manifest-filesize-approx', 'allow-unsafe-ext', + 'prefer-legacy-http-handler', 'manifest-filesize-approx', 'allow-unsafe-ext', 'prefer-vp9-sort', }, 'aliases': { - 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext'], - 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext'], + 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext', '-prefer-vp9-sort'], + 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext', '-prefer-vp9-sort'], '2021': ['2022', 'no-certifi', 'filename-sanitization'], '2022': ['2023', 'no-external-downloader-progress', 'playlist-match-filter', 'prefer-legacy-http-handler', 'manifest-filesize-approx'], - '2023': [], + '2023': ['prefer-vp9-sort'], }, }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 
2f4c0a00f614..844818e38805 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -5337,8 +5337,11 @@ class FormatSorter:
     regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<value>.*?))?)? *$'
 
     default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
-               'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
+               'res', 'fps', 'hdr:12', 'vcodec', 'channels', 'acodec',
                'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')  # These must not be aliases
+    _prefer_vp9_sort = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality',
+                        'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec',
+                        'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id')
 
     ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr',
                     'height', 'width', 'proto', 'vext', 'abr', 'aext',
                     'fps', 'fs_approx', 'source', 'id')

From beae2db127d3b5017cbcf685da9de7a9ef496541 Mon Sep 17 00:00:00 2001
From: sepro <sepro@sepr0.com>
Date: Sun, 3 Nov 2024 21:03:09 +0100
Subject: [PATCH 18/30] [aes] Fix GCM pad length calculation (#11438)

Closes #10169
Authored by: seproDev
---
 test/test_aes.py | 12 ++++++++++++
 yt_dlp/aes.py    |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/test/test_aes.py b/test/test_aes.py
index 5f975efecfa4..6fe6059a17ad 100644
--- a/test/test_aes.py
+++ b/test/test_aes.py
@@ -83,6 +83,18 @@ def test_gcm_decrypt(self):
             data, intlist_to_bytes(self.key), authentication_tag, intlist_to_bytes(self.iv[:12]))
         self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg)
 
+    def test_gcm_aligned_decrypt(self):
+        data = b'\x159Y\xcf5eud\x90\x9c\x85&]\x14\x1d\x0f'
+        authentication_tag = b'\x08\xb1\x9d!&\x98\xd0\xeaRq\x90\xe6;\xb5]\xd8'
+
+        decrypted = intlist_to_bytes(aes_gcm_decrypt_and_verify(
+            list(data), self.key, list(authentication_tag), self.iv[:12]))
+        self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg[:16])
+        if Cryptodome.AES:
+            decrypted = aes_gcm_decrypt_and_verify_bytes(
+                data, bytes(self.key), authentication_tag, bytes(self.iv[:12]))
+            self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg[:16])
+
     def test_decrypt_text(self):
         password = intlist_to_bytes(self.key).decode()
         encrypted = base64.b64encode(
diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py
index abf54a998e0e..be67b40fe215 100644
--- a/yt_dlp/aes.py
+++ b/yt_dlp/aes.py
@@ -230,11 +230,11 @@ def aes_gcm_decrypt_and_verify(data, key, tag, nonce):
     iv_ctr = inc(j0)
 
     decrypted_data = aes_ctr_decrypt(data, key, iv_ctr + [0] * (BLOCK_SIZE_BYTES - len(iv_ctr)))
-    pad_len = len(data) // 16 * 16
+    pad_len = (BLOCK_SIZE_BYTES - (len(data) % BLOCK_SIZE_BYTES)) % BLOCK_SIZE_BYTES
     s_tag = ghash(
         hash_subkey,
         data
-        + [0] * (BLOCK_SIZE_BYTES - len(data) + pad_len)  # pad
+        + [0] * pad_len  # pad
         + bytes_to_intlist((0 * 8).to_bytes(8, 'big')  # length of associated data
                            + ((len(data) * 8).to_bytes(8, 'big'))),  # length of data
     )

From 754940e9a558565d6bd3c0c529802569b1d0ae4e Mon Sep 17 00:00:00 2001
From: sepro <sepro@sepr0.com>
Date: Sun, 3 Nov 2024 21:19:35 +0100
Subject: [PATCH 19/30] [ie/bfmtv] Fix extractors (#11444)

Authored by: seproDev
---
 yt_dlp/extractor/bfmtv.py | 66 ++++++++++++++++++++++++++-------------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/yt_dlp/extractor/bfmtv.py b/yt_dlp/extractor/bfmtv.py
index 87f011783bfe..49d4819a3df4 100644
--- a/yt_dlp/extractor/bfmtv.py
+++ b/yt_dlp/extractor/bfmtv.py
@@ -1,18 +1,33 @@
 import re
 
 from .common import InfoExtractor
-from ..utils import extract_attributes
+from ..utils import ExtractorError, extract_attributes
 
 
 class BFMTVBaseIE(InfoExtractor):
     _VALID_URL_BASE = r'https?://(?:www\.|rmc\.)?bfmtv\.com/'
     _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html'
-    _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block[^"]*"[^>]*>)'
+    _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block[^"]*"[^>]*>.*?</div>)'
+    _VIDEO_ELEMENT_REGEX = r'(<video-js[^>]+>)'
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
 
-    def _brightcove_url_result(self, video_id, video_block):
-        account_id = video_block.get('accountid') or '876450612001'
-        player_id = video_block.get('playerid') or 'I2qBTln4u'
+    def _extract_video(self, video_block):
+        video_element = self._search_regex(
+            self._VIDEO_ELEMENT_REGEX, video_block, 'video element', default=None)
+        if video_element:
+            video_element_attrs = extract_attributes(video_element)
+            video_id = video_element_attrs.get('data-video-id')
+            if not video_id:
+                return
+            account_id = video_element_attrs.get('data-account') or '876450610001'
+            player_id = video_element_attrs.get('adjustplayer') or '19dszYXgm'
+        else:
+            video_block_attrs = extract_attributes(video_block)
+            video_id = video_block_attrs.get('videoid')
+            if not video_id:
+                return
+            account_id = video_block_attrs.get('accountid') or '876630703001'
+            player_id = video_block_attrs.get('playerid') or 'KbPwEbuHx'
         return self.url_result(
             self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
             'BrightcoveNew', video_id)
@@ -40,23 +55,25 @@ class BFMTVIE(BFMTVBaseIE):
     def _real_extract(self, url):
         bfmtv_id = self._match_id(url)
         webpage = self._download_webpage(url, bfmtv_id)
-        video_block = extract_attributes(self._search_regex(
+        video = self._extract_video(self._search_regex(
             self._VIDEO_BLOCK_REGEX, webpage, 'video block'))
-        return self._brightcove_url_result(video_block['videoid'], video_block)
+        if not video:
+            raise ExtractorError('Failed to extract video')
+        return video
 
 
-class BFMTVLiveIE(BFMTVIE):  # XXX: Do not subclass from concrete IE
+class BFMTVLiveIE(BFMTVBaseIE):
     IE_NAME = 'bfmtv:live'
     _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)'
     _TESTS = [{
         'url': 'https://www.bfmtv.com/en-direct/',
         'info_dict': {
-            'id': '5615950982001',
+            'id': '6346069778112',
             'ext': 'mp4',
-            'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+            'title': r're:^Le Live BFM TV \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
             'uploader_id': '876450610001',
-            'upload_date': '20220926',
-            'timestamp': 1664207191,
+            'upload_date': '20240202',
+            'timestamp': 1706887572,
             'live_status': 'is_live',
             'thumbnail': r're:https://.+/image\.jpg',
             'tags': [],
@@ -69,6 +86,15 @@ class BFMTVLiveIE(BFMTVIE):  # XXX: Do not subclass from concrete IE
         'only_matching': True,
     }]
 
+    def _real_extract(self, url):
+        bfmtv_id = self._match_id(url)
+        webpage = self._download_webpage(url, bfmtv_id)
+        video = self._extract_video(self._search_regex(
+            self._VIDEO_BLOCK_REGEX, webpage, 'video block'))
+        if not video:
+            raise ExtractorError('Failed to extract video')
+        return video
+
 
 class BFMTVArticleIE(BFMTVBaseIE):
     IE_NAME = 'bfmtv:article'
@@ -102,18 +128,16 @@ class BFMTVArticleIE(BFMTVBaseIE):
         },
     }]
 
+    def _entries(self, webpage):
+        for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage):
+            video = self._extract_video(video_block_el)
+            if video:
+                yield video
+
     def _real_extract(self, url):
         bfmtv_id = self._match_id(url)
         webpage = self._download_webpage(url, bfmtv_id)
 
-        entries = []
-        for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage):
-            video_block = extract_attributes(video_block_el)
-            video_id = video_block.get('videoid')
-            if not video_id:
-                continue
-            entries.append(self._brightcove_url_result(video_id, video_block))
-
         return self.playlist_result(
-            entries, bfmtv_id, self._og_search_title(webpage, fatal=False),
+            self._entries(webpage), bfmtv_id, self._og_search_title(webpage, fatal=False),
             self._html_search_meta(['og:description', 'description'], webpage))

From a403dcf9be20b49cbb3017328f4aaa352fb6d685 Mon Sep 17 00:00:00 2001
From: Mozi <29089388+pzhlkj6612@users.noreply.github.com>
Date: Mon, 4 Nov 2024 07:02:48 +0800
Subject: [PATCH 20/30] [ie/Dailymotion] Improve embed extraction (#10843)

Closes #8848, Closes #9432
Authored by: pzhlkj6612, bashonly

Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com>
---
 yt_dlp/extractor/dailymotion.py | 115 ++++++++++++++++++++++++++++----
 1 file changed, 101 insertions(+), 14 deletions(-)

diff --git a/yt_dlp/extractor/dailymotion.py b/yt_dlp/extractor/dailymotion.py
index 632335e5b050..4e99fdda7a0f 100644
--- a/yt_dlp/extractor/dailymotion.py
+++ b/yt_dlp/extractor/dailymotion.py
@@ -10,11 +10,14 @@
     OnDemandPagedList,
     age_restricted,
     clean_html,
+    extract_attributes,
     int_or_none,
     traverse_obj,
     try_get,
     unescapeHTML,
     unsmuggle_url,
+    update_url,
+    url_or_none,
     urlencode_postdata,
 )
 
@@ -99,11 +102,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
     _VALID_URL = r'''(?ix)
                 https?://
                     (?:
-                        (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:(?:embed|swf|\#)/)|player(?:/\w+)?\.html\?)?video|swf)|
-                        (?:www\.)?lequipe\.fr/video
+                        (?:(?:www|touch|geo)\.)?dailymotion\.[a-z]{2,3}|
+                        (?:www\.)?lequipe\.fr
+                    )/
+                    (?:
+                        swf/(?!video)|
+                        (?:(?:crawler|embed|swf)/)?video/|
+                        player(?:/[\da-z]+)?\.html\?(?:video|(?P<is_playlist>playlist))=
                     )
-                    [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
-                    '''
+                    (?P<id>[^/?_&#]+)(?:[\w-]*\?playlist=(?P<playlist_id>x[0-9a-z]+))?
+                    '''
     IE_NAME = 'dailymotion'
     _EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1']
     _TESTS = [{
@@ -217,6 +225,63 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
     }, {
         'url': 'https://geo.dailymotion.com/player/xakln.html?video=x8mjju4&customConfig%5BcustomParams%5D=%2Ffr-fr%2Ftennis%2Fwimbledon-mens-singles%2Farticles-video',
         'only_matching': True,
+    }, {  # playlist-only
+        'url': 'https://geo.dailymotion.com/player/xf7zn.html?playlist=x7wdsj',
+        'only_matching': True,
+    }, {
+        'url': 'https://geo.dailymotion.com/player/xmyye.html?video=x93blhi',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.dailymotion.com/crawler/video/x8u4owg',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.dailymotion.com/embed/video/x8u4owg',
+        'only_matching': True,
+    }]
+    _WEBPAGE_TESTS = [{
+        # https://geo.dailymotion.com/player/xmyye.html?video=x93blhi
+        'url': 'https://www.financialounge.com/video/2024/08/01/borse-europee-in-rosso-dopo-la-fed-a-milano-volano-mediobanca-e-tim-edizione-del-1-agosto/',
+        'info_dict': {
+            'id': 'x93blhi',
+            'ext': 'mp4',
+            'title': 'OnAir - 01/08/24',
+            'description': '',
+            'duration': 217,
+            'timestamp': 1722505658,
+            'upload_date': '20240801',
+            'uploader': 'Financialounge',
+            'uploader_id': 'x2vtgmm',
+            'age_limit': 0,
+            'tags': [],
+            'view_count': int,
+            'like_count': int,
+        },
+    }, {
+        # https://geo.dailymotion.com/player/xf7zn.html?playlist=x7wdsj
+        'url': 'https://www.cycleworld.com/blogs/ask-kevin/ducati-continues-to-evolve-with-v4/',
+        'info_dict': {
+            'id': 'x7wdsj',
+        },
+        'playlist_mincount': 50,
+    }, {
+        # https://www.dailymotion.com/crawler/video/x8u4owg
+        'url': 'https://www.leparisien.fr/environnement/video-le-veloto-la-voiture-a-pedales-qui-aimerait-se-faire-une-place-sur-les-routes-09-03-2024-KCYMCPM4WFHJXMSKBUI66UNFPU.php',
+        'info_dict': {
+            'id': 'x8u4owg',
+            'ext': 'mp4',
+            'like_count': int,
+            'uploader': 'Le Parisien',
+            'thumbnail': 'https://www.leparisien.fr/resizer/ho_GwveeYftNkLwg_cEta--5Bv4=/1200x675/cloudfront-eu-central-1.images.arcpublishing.com/leparisien/BFXJNEBN75EUNHGYJLORUC3TX4.jpg',
+            'upload_date': '20240309',
+            'view_count': int,
+            'timestamp': 1709997866,
+            'age_limit': 0,
+            'uploader_id': 'x32f7b',
+            'title': 'VIDÉO. Le «\xa0véloto\xa0», la voiture à pédales qui aimerait se faire une place sur les routes',
+            'duration': 428.0,
+            'description': 'À bord du « véloto », l’alternative à la voiture pour la campagne',
+            'tags': ['biclou', 'vélo', 'véloto', 'campagne', 'voiture', 'environnement', 'véhicules intermédiaires'],
+        },
+    }]
     _GEO_BYPASS = False
     _COMMON_MEDIA_FIELDS = '''description
@@ -232,16 +297,35 @@ def _extract_embed_urls(cls, url, webpage):
         for mobj in re.finditer(
                 r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage):
             yield from 'https://www.dailymotion.com/embed/video/' + mobj.group('id')
+        for mobj in re.finditer(
+                r'(?s)