From 773bbb181506856ffda95496ab60c1c9603f1f71 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 2 Jul 2024 16:17:06 -0500 Subject: [PATCH 01/95] [core] Fix `--compat-opt allow-unsafe-ext` (#10336) Fixes bug in 5ce582448ececb8d9c30c8c31f58330090ced03a Authored by: bashonly, rdamas Co-authored-by: Robert Damas --- yt_dlp/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index f88f15d70..0e48569e3 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -599,7 +599,7 @@ def report_deprecation(val, old, new=None): warnings.append( 'Using allow-unsafe-ext opens you up to potential attacks. ' 'Use with great care!') - _UnsafeExtensionError.sanitize_extension = lambda x: x + _UnsafeExtensionError.sanitize_extension = lambda x, prepend=False: x return warnings, deprecation_warnings From d502f4c6d95b74896f40070d07229997f0850f31 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 2 Jul 2024 16:24:17 -0500 Subject: [PATCH 02/95] [pp/embedthumbnail] Fix embedding with mutagen (#10337) Fixes regression in f2a4ea1794718e4dc0148bc172cb877f1080903b Closes #10335 Authored by: bashonly --- yt_dlp/postprocessor/embedthumbnail.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/postprocessor/embedthumbnail.py b/yt_dlp/postprocessor/embedthumbnail.py index f2228ac61..16c8bcdda 100644 --- a/yt_dlp/postprocessor/embedthumbnail.py +++ b/yt_dlp/postprocessor/embedthumbnail.py @@ -134,7 +134,7 @@ def run(self, info): meta = MP4(filename) # NOTE: the 'covr' atom is a non-standard MPEG-4 atom, # Apple iTunes 'M4A' files include the 'moov.udta.meta.ilst' atom. - meta.tags['covr'] = [MP4Cover(data=thumb_data, imageformat=f)] + meta.tags['covr'] = [MP4Cover(data=thumb_data, imageformat=f[type_])] meta.save() temp_filename = filename except Exception as err: From 6403530e2dfe259a87afe444708c4f3024cc45b8 Mon Sep 17 00:00:00 2001 From: DrakoCpp <160542400+DrakoCpp@users.noreply.github.com> Date: Tue, 2 Jul 2024 23:49:09 +0200 Subject: [PATCH 03/95] [ie/murrtube] Fix extractor (#9249) Closes #7500 Authored by: DrakoCpp --- yt_dlp/extractor/murrtube.py | 157 +++++++++++++++++------------------ 1 file changed, 77 insertions(+), 80 deletions(-) diff --git a/yt_dlp/extractor/murrtube.py b/yt_dlp/extractor/murrtube.py index 3b39a1b9a..9067b8781 100644 --- a/yt_dlp/extractor/murrtube.py +++ b/yt_dlp/extractor/murrtube.py @@ -5,39 +5,103 @@ from ..utils import ( ExtractorError, OnDemandPagedList, - determine_ext, - int_or_none, - try_get, + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_id, + parse_count, + remove_end, + update_url, + urlencode_postdata, ) class MurrtubeIE(InfoExtractor): - _WORKING = False _VALID_URL = r'''(?x) (?: murrtube:| - https?://murrtube\.net/videos/(?P[a-z0-9\-]+)\- + https?://murrtube\.net/(?:v/|videos/(?P[a-z0-9-]+?)-) ) - (?P[a-f0-9]{8}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{4}\-[a-f0-9]{12}) + (?P[A-Z0-9]{4}|[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}) ''' - _TEST = { + _TESTS = [{ 'url': 'https://murrtube.net/videos/inferno-x-skyler-148b6f2a-fdcc-4902-affe-9c0f41aaaca0', - 'md5': '169f494812d9a90914b42978e73aa690', + 'md5': '70380878a77e8565d4aea7f68b8bbb35', 'info_dict': { - 'id': '148b6f2a-fdcc-4902-affe-9c0f41aaaca0', + 'id': 'ca885d8456b95de529b6723b158032e11115d', 'ext': 'mp4', 'title': 'Inferno X Skyler', 'description': 'Humping a very good slutty sheppy (roomate)', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 284, 'uploader': 'Inferno Wolf', 'age_limit': 18, + 'thumbnail': 'https://storage.murrtube.net/murrtube-production/ekbs3zcfvuynnqfx72nn2tkokvsd', 'comment_count': int, 'view_count': int, 'like_count': int, - 'tags': ['hump', 'breed', 'Fursuit', 'murrsuit', 'bareback'], }, - } + }, { + 'url': 'https://murrtube.net/v/0J2Q', + 'md5': '31262f6ac56f0ca75e5a54a0f3fefcb6', + 'info_dict': { + 'id': '8442998c52134968d9caa36e473e1a6bac6ca', + 'ext': 'mp4', + 'uploader': 'Hayel', + 'title': 'Who\'s in charge now?', + 'description': 'md5:795791e97e5b0f1805ea84573f02a997', + 'age_limit': 18, + 'thumbnail': 'https://storage.murrtube.net/murrtube-production/fb1ojjwiucufp34ya6hxu5vfqi5s', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + }, + }] + + def _extract_count(self, name, html): + return parse_count(self._search_regex( + rf'([\d,]+)\s+]*>{name}', html, name, default=None)) + + def _real_initialize(self): + homepage = self._download_webpage( + 'https://murrtube.net', None, note='Getting session token') + self._request_webpage( + 'https://murrtube.net/accept_age_check', None, 'Setting age cookie', + data=urlencode_postdata(self._hidden_inputs(homepage))) + + def _real_extract(self, url): + video_id = self._match_id(url) + if video_id.startswith('murrtube:'): + raise ExtractorError('Support for murrtube: prefix URLs is broken') + video_page = self._download_webpage(url, video_id) + video_attrs = extract_attributes(get_element_html_by_id('video', video_page)) + playlist = update_url(video_attrs['data-url'], query=None) + video_id = self._search_regex(r'/([\da-f]+)/index.m3u8', playlist, 'video id') + + return { + 'id': video_id, + 'title': remove_end(self._og_search_title(video_page), ' - Murrtube'), + 'age_limit': 18, + 'formats': self._extract_m3u8_formats(playlist, video_id, 'mp4'), + 'description': self._og_search_description(video_page), + 'thumbnail': update_url(self._og_search_thumbnail(video_page, default=''), query=None) or None, + 'uploader': clean_html(get_element_by_class('pl-1 is-size-6 has-text-lighter', video_page)), + 'view_count': self._extract_count('Views', video_page), + 'like_count': self._extract_count('Likes', video_page), + 'comment_count': self._extract_count('Comments', video_page), + } + + +class MurrtubeUserIE(InfoExtractor): + _WORKING = False + IE_DESC = 'Murrtube user profile' + _VALID_URL = r'https?://murrtube\.net/(?P[^/]+)$' + _TESTS = [{ + 'url': 'https://murrtube.net/stormy', + 'info_dict': { + 'id': 'stormy', + }, + 'playlist_mincount': 27, + }] + _PAGE_SIZE = 10 def _download_gql(self, video_id, op, note=None, fatal=True): result = self._download_json( @@ -46,73 +110,6 @@ def _download_gql(self, video_id, op, note=None, fatal=True): headers={'Content-Type': 'application/json'}) return result['data'] - def _real_extract(self, url): - video_id = self._match_id(url) - data = self._download_gql(video_id, { - 'operationName': 'Medium', - 'variables': { - 'id': video_id, - }, - 'query': '''\ -query Medium($id: ID!) { - medium(id: $id) { - title - description - key - duration - commentsCount - likesCount - viewsCount - thumbnailKey - tagList - user { - name - __typename - } - __typename - } -}'''}) - meta = data['medium'] - - storage_url = 'https://storage.murrtube.net/murrtube/' - format_url = storage_url + meta.get('key', '') - thumbnail = storage_url + meta.get('thumbnailKey', '') - - if determine_ext(format_url) == 'm3u8': - formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', fatal=False) - else: - formats = [{'url': format_url}] - - return { - 'id': video_id, - 'title': meta.get('title'), - 'description': meta.get('description'), - 'formats': formats, - 'thumbnail': thumbnail, - 'duration': int_or_none(meta.get('duration')), - 'uploader': try_get(meta, lambda x: x['user']['name']), - 'view_count': meta.get('viewsCount'), - 'like_count': meta.get('likesCount'), - 'comment_count': meta.get('commentsCount'), - 'tags': meta.get('tagList'), - 'age_limit': 18, - } - - -class MurrtubeUserIE(MurrtubeIE): # XXX: Do not subclass from concrete IE - _WORKING = False - IE_DESC = 'Murrtube user profile' - _VALID_URL = r'https?://murrtube\.net/(?P[^/]+)$' - _TEST = { - 'url': 'https://murrtube.net/stormy', - 'info_dict': { - 'id': 'stormy', - }, - 'playlist_mincount': 27, - } - _PAGE_SIZE = 10 - def _fetch_page(self, username, user_id, page): data = self._download_gql(username, { 'operationName': 'Media', From 7509791385ba88cb7ec0ab17e826681f4af4b66e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patryk=20Mi=C5=9B?= Date: Tue, 2 Jul 2024 23:51:07 +0200 Subject: [PATCH 04/95] [ie/banbye] Fix extractor (#10332) Closes #8584 Authored by: PatrykMis, seproDev Co-authored-by: sepro <4618135+seproDev@users.noreply.github.com> --- yt_dlp/extractor/banbye.py | 71 +++++++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py index d10bdf8da..148a1705e 100644 --- a/yt_dlp/extractor/banbye.py +++ b/yt_dlp/extractor/banbye.py @@ -4,9 +4,13 @@ from .common import InfoExtractor from ..utils import ( InAdvancePagedList, + determine_ext, format_field, + int_or_none, + join_nonempty, traverse_obj, unified_timestamp, + url_or_none, ) @@ -30,6 +34,7 @@ def _extract_playlist(self, playlist_id): class BanByeIE(BanByeBaseIE): _VALID_URL = r'https?://(?:www\.)?banbye\.com/(?:en/)?watch/(?P[\w-]+)' _TESTS = [{ + # ['src']['mp4']['levels'] direct mp4 urls only 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', 'info_dict': { @@ -58,6 +63,7 @@ class BanByeIE(BanByeBaseIE): }, 'playlist_mincount': 9, }, { + # ['src']['mp4']['levels'] direct mp4 urls only 'url': 'https://banbye.com/watch/v_kb6_o1Kyq-CD', 'info_dict': { 'id': 'v_kb6_o1Kyq-CD', @@ -77,6 +83,48 @@ class BanByeIE(BanByeBaseIE): 'view_count': int, 'comment_count': int, }, + }, { + # ['src']['hls']['levels'] variant m3u8 urls only; master m3u8 is 404 + 'url': 'https://banbye.com/watch/v_a_gPFuC9LoW5', + 'info_dict': { + 'id': 'v_a_gPFuC9LoW5', + 'ext': 'mp4', + 'title': 'md5:183524056bebdfa245fd6d214f63c0fe', + 'description': 'md5:943ac87287ca98d28d8b8797719827c6', + 'uploader': 'wRealu24', + 'channel_id': 'ch_wrealu24', + 'channel_url': 'https://banbye.com/channel/ch_wrealu24', + 'upload_date': '20231113', + 'timestamp': 1699874062, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'thumbnail': 'https://cdn.banbye.com/video/v_a_gPFuC9LoW5/96.webp', + 'tags': ['jaszczur', 'sejm', 'lewica', 'polska', 'ukrainizacja', 'pierwszeposiedzeniesejmu'], + }, + 'expected_warnings': ['Failed to download m3u8'], + }, { + # ['src']['hls']['masterPlaylist'] m3u8 only + 'url': 'https://banbye.com/watch/v_B0rsKWsr-aaa', + 'info_dict': { + 'id': 'v_B0rsKWsr-aaa', + 'ext': 'mp4', + 'title': 'md5:00b254164b82101b3f9e5326037447ed', + 'description': 'md5:3fd8b48aa81954ba024bc60f5de6e167', + 'uploader': 'PSTV Piotr Szlachtowicz ', + 'channel_id': 'ch_KV9EVObkB9wB', + 'channel_url': 'https://banbye.com/channel/ch_KV9EVObkB9wB', + 'upload_date': '20240629', + 'timestamp': 1719646816, + 'duration': 2377, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'thumbnail': 'https://cdn.banbye.com/video/v_B0rsKWsr-aaa/96.webp', + 'tags': ['Biden', 'Trump', 'Wybory', 'USA'], + }, }] def _real_extract(self, url): @@ -91,11 +139,24 @@ def _real_extract(self, url): 'id': f'{quality}p', 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.webp', } for quality in [48, 96, 144, 240, 512, 1080]] - formats = [{ - 'format_id': f'http-{quality}p', - 'quality': quality, - 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4', - } for quality in data['quality']] + + formats = [] + url_data = self._download_json(f'{self._API_BASE}/videos/{video_id}/url', video_id, data=b'') + if master_url := traverse_obj(url_data, ('src', 'hls', 'masterPlaylist', {url_or_none})): + formats = self._extract_m3u8_formats(master_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + + for format_id, format_url in traverse_obj(url_data, ( + 'src', ('mp4', 'hls'), 'levels', {dict.items}, lambda _, v: url_or_none(v[1]))): + ext = determine_ext(format_url) + is_hls = ext == 'm3u8' + formats.append({ + 'url': format_url, + 'ext': 'mp4' if is_hls else ext, + 'format_id': join_nonempty(is_hls and 'hls', format_id), + 'protocol': 'm3u8_native' if is_hls else 'https', + 'height': int_or_none(format_id), + }) + self._remove_duplicate_formats(formats) return { 'id': video_id, From 7799e518956387bb3c1064c9beae26eab8d5044a Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Tue, 2 Jul 2024 22:22:52 +0000 Subject: [PATCH 05/95] [ie/zaiko] Support JWT video URLs (#10130) Closes #9798 Authored by: pzhlkj6612 --- yt_dlp/extractor/zaiko.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/zaiko.py b/yt_dlp/extractor/zaiko.py index c8c4ec0b8..4563b7ba0 100644 --- a/yt_dlp/extractor/zaiko.py +++ b/yt_dlp/extractor/zaiko.py @@ -66,7 +66,9 @@ def _real_extract(self, url): stream_meta['stream-access']['video_source'], video_id, 'Downloading player page', headers={'referer': 'https://zaiko.io/'}) player_meta = self._parse_vue_element_attr('player', player_page, video_id) - status = traverse_obj(player_meta, ('initial_event_info', 'status', {str})) + initial_event_info = traverse_obj(player_meta, ('initial_event_info', {dict})) or {} + + status = traverse_obj(initial_event_info, ('status', {str})) live_status, msg, expected = { 'vod': ('was_live', 'No VOD stream URL was found', False), 'archiving': ('post_live', 'Event VOD is still being processed', True), @@ -80,14 +82,20 @@ def _real_extract(self, url): 'cancelled': ('not_live', 'Event has been cancelled', True), }.get(status) or ('not_live', f'Unknown event status "{status}"', False) - stream_url = traverse_obj(player_meta, ('initial_event_info', 'endpoint', {url_or_none})) + if traverse_obj(initial_event_info, ('is_jwt_protected', {bool})): + stream_url = self._download_json( + initial_event_info['jwt_token_url'], video_id, 'Downloading JWT-protected stream URL', + 'Failed to download JWT-protected stream URL')['playback_url'] + else: + stream_url = traverse_obj(initial_event_info, ('endpoint', {url_or_none})) + formats = self._extract_m3u8_formats( stream_url, video_id, live=True, fatal=False) if stream_url else [] if not formats: self.raise_no_formats(msg, expected=expected) thumbnail_urls = [ - traverse_obj(player_meta, ('initial_event_info', 'poster_url')), + traverse_obj(initial_event_info, ('poster_url', {url_or_none})), self._og_search_thumbnail(self._download_webpage( f'https://zaiko.io/event/{video_id}', video_id, 'Downloading event page', fatal=False) or ''), ] @@ -103,9 +111,7 @@ def _real_extract(self, url): 'release_timestamp': ('stream', 'start', 'timestamp', {int_or_none}), 'categories': ('event', 'genres', ..., {lambda x: x or None}), }), - **traverse_obj(player_meta, ('initial_event_info', { - 'alt_title': ('title', {str}), - })), + 'alt_title': traverse_obj(initial_event_info, ('title', {str})), 'thumbnails': [{'url': url, 'id': url_basename(url)} for url in thumbnail_urls if url_or_none(url)], } From 93d33cb29af9e2e84369ac43589d50ce8e0160ef Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 2 Jul 2024 18:03:08 -0500 Subject: [PATCH 06/95] [cleanup] Misc (#10330) Authored by: bashonly --- .gitignore | 2 +- Changelog.md | 2 +- Makefile | 2 +- README.md | 6 +++--- devscripts/changelog_override.json | 2 +- yt_dlp/options.py | 4 ++-- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index db322c4f0..fdd904f7f 100644 --- a/.gitignore +++ b/.gitignore @@ -51,7 +51,6 @@ cookies *.srt *.ssa *.swf -*.swp *.tt *.ttml *.url @@ -119,6 +118,7 @@ yt-dlp.zip .vscode *.sublime-* *.code-workspace +*.swp # Lazy extractors */extractor/lazy_extractors.py diff --git a/Changelog.md b/Changelog.md index 3dbbc210c..64a0c47fb 100644 --- a/Changelog.md +++ b/Changelog.md @@ -7,7 +7,7 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/wor ### 2024.07.01 #### Important changes -- Security: [[CVE-2024-10123](https://nvd.nist.gov/vuln/detail/CVE-2024-10123)] [Properly sanitize file-extension to prevent file system modification and RCE](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j) +- Security: [[CVE-2024-38519](https://nvd.nist.gov/vuln/detail/CVE-2024-38519)] [Properly sanitize file-extension to prevent file system modification and RCE](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j) - Unsafe extensions are now blocked from being downloaded #### Core changes diff --git a/Makefile b/Makefile index e1de7f3e9..6c72ead1e 100644 --- a/Makefile +++ b/Makefile @@ -21,7 +21,7 @@ clean-test: rm -rf test/testdata/sigs/player-*.js tmp/ *.annotations.xml *.aria2 *.description *.dump *.frag \ *.frag.aria2 *.frag.urls *.info.json *.live_chat.json *.meta *.part* *.tmp *.temp *.unknown_video *.ytdl \ *.3gp *.ape *.ass *.avi *.desktop *.f4v *.flac *.flv *.gif *.jpeg *.jpg *.lrc *.m4a *.m4v *.mhtml *.mkv *.mov *.mp3 *.mp4 \ - *.mpg *.mpga *.oga *.ogg *.opus *.png *.sbv *.srt *.ssa *.swf *.swp *.tt *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp + *.mpg *.mpga *.oga *.ogg *.opus *.png *.sbv *.srt *.ssa *.swf *.tt *.ttml *.url *.vtt *.wav *.webloc *.webm *.webp clean-dist: rm -rf yt-dlp.1.temp.md yt-dlp.1 README.txt MANIFEST build/ dist/ .coverage cover/ yt-dlp.tar.gz completions/ \ yt_dlp/extractor/lazy_extractors.py *.spec CONTRIBUTING.md.tmp yt-dlp yt-dlp.exe yt_dlp.egg-info/ AUTHORS diff --git a/README.md b/README.md index e8aeb93f7..836e084e6 100644 --- a/README.md +++ b/README.md @@ -2222,9 +2222,9 @@ ### Differences in default behavior For ease of use, a few more compat options are available: -* `--compat-options all`: Use all compat options (Do NOT use) -* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx` -* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx` +* `--compat-options all`: Use all compat options (**Do NOT use this!**) +* `--compat-options youtube-dl`: Same as `--compat-options all,-multistreams,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext` +* `--compat-options youtube-dlc`: Same as `--compat-options all,-no-live-chat,-no-youtube-channel-redirect,-playlist-match-filter,-manifest-filesize-approx,-allow-unsafe-ext` * `--compat-options 2021`: Same as `--compat-options 2022,no-certifi,filename-sanitization,no-youtube-prefer-utc-upload-date` * `--compat-options 2022`: Same as `--compat-options 2023,playlist-match-filter,no-external-downloader-progress,prefer-legacy-http-handler,manifest-filesize-approx` * `--compat-options 2023`: Currently does nothing. Use this to enable all future compat options diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index ced38a0dd..ab42f5549 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -179,6 +179,6 @@ { "action": "add", "when": "6aaf96a3d6e7d0d426e97e11a2fcf52fda00e733", - "short": "[priority] Security: [[CVE-2024-10123](https://nvd.nist.gov/vuln/detail/CVE-2024-10123)] [Properly sanitize file-extension to prevent file system modification and RCE](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j)\n - Unsafe extensions are now blocked from being downloaded" + "short": "[priority] Security: [[CVE-2024-38519](https://nvd.nist.gov/vuln/detail/CVE-2024-38519)] [Properly sanitize file-extension to prevent file system modification and RCE](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-79w7-vh3h-8g4j)\n - Unsafe extensions are now blocked from being downloaded" } ] diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 1b18575c1..76db06c85 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -476,8 +476,8 @@ def _alias_callback(option, opt_str, value, parser, opts, nargs): 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date', 'prefer-legacy-http-handler', 'manifest-filesize-approx', 'allow-unsafe-ext', }, 'aliases': { - 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter', '-manifest-filesize-approx'], - 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx'], + 'youtube-dl': ['all', '-multistreams', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext'], + 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat', '-playlist-match-filter', '-manifest-filesize-approx', '-allow-unsafe-ext'], '2021': ['2022', 'no-certifi', 'filename-sanitization'], '2022': ['2023', 'no-external-downloader-progress', 'playlist-match-filter', 'prefer-legacy-http-handler', 'manifest-filesize-approx'], '2023': [], From d28aa87e215991023a0b2ea6fae0e000f283dcd1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 2 Jul 2024 23:13:48 +0000 Subject: [PATCH 07/95] Release 2024.07.02 Created by: bashonly :ci skip all :ci run dl --- CONTRIBUTORS | 2 ++ Changelog.md | 16 ++++++++++++++++ supportedsites.md | 2 +- yt_dlp/version.py | 6 +++--- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index a89357275..7d0c5bdb8 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -644,3 +644,5 @@ peisenwang TheZ3ro tippfehlr varunchopra +DrakoCpp +PatrykMis diff --git a/Changelog.md b/Changelog.md index 64a0c47fb..b1eb6e367 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,22 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.07.02 + +#### Core changes +- [Fix `--compat-opt allow-unsafe-ext`](https://github.com/yt-dlp/yt-dlp/commit/773bbb181506856ffda95496ab60c1c9603f1f71) ([#10336](https://github.com/yt-dlp/yt-dlp/issues/10336)) by [bashonly](https://github.com/bashonly), [rdamas](https://github.com/rdamas) + +#### Extractor changes +- **banbye**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7509791385ba88cb7ec0ab17e826681f4af4b66e) ([#10332](https://github.com/yt-dlp/yt-dlp/issues/10332)) by [PatrykMis](https://github.com/PatrykMis), [seproDev](https://github.com/seproDev) +- **murrtube**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6403530e2dfe259a87afe444708c4f3024cc45b8) ([#9249](https://github.com/yt-dlp/yt-dlp/issues/9249)) by [DrakoCpp](https://github.com/DrakoCpp) +- **zaiko**: [Support JWT video URLs](https://github.com/yt-dlp/yt-dlp/commit/7799e518956387bb3c1064c9beae26eab8d5044a) ([#10130](https://github.com/yt-dlp/yt-dlp/issues/10130)) by [pzhlkj6612](https://github.com/pzhlkj6612) + +#### Postprocessor changes +- **embedthumbnail**: [Fix embedding with mutagen](https://github.com/yt-dlp/yt-dlp/commit/d502f4c6d95b74896f40070d07229997f0850f31) ([#10337](https://github.com/yt-dlp/yt-dlp/issues/10337)) by [bashonly](https://github.com/bashonly) + +#### Misc. changes +- **cleanup**: Miscellaneous: [93d33cb](https://github.com/yt-dlp/yt-dlp/commit/93d33cb29af9e2e84369ac43589d50ce8e0160ef) by [bashonly](https://github.com/bashonly) + ### 2024.07.01 #### Important changes diff --git a/supportedsites.md b/supportedsites.md index 656366b4a..15fc496b5 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -839,7 +839,7 @@ # Supported sites - **MTVUutisetArticle**: (**Currently broken**) - **MuenchenTV**: münchen.tv (**Currently broken**) - **MujRozhlas** - - **Murrtube**: (**Currently broken**) + - **Murrtube** - **MurrtubeUser**: Murrtube user profile (**Currently broken**) - **MuseAI** - **MuseScore** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 6e8fd3ae8..7581a3b21 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.07.01' +__version__ = '2024.07.02' -RELEASE_GIT_HEAD = '5ce582448ececb8d9c30c8c31f58330090ced03a' +RELEASE_GIT_HEAD = '93d33cb29af9e2e84369ac43589d50ce8e0160ef' VARIANT = None @@ -12,4 +12,4 @@ ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.07.01' +_pkg_version = '2024.07.02' From cc767e9490056efaaa11c186b0d032e4b4969180 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 3 Jul 2024 11:46:01 -0500 Subject: [PATCH 08/95] [core] Fix `--ignore-no-formats-error` (#10345) Fixes regression in 5ce582448ececb8d9c30c8c31f58330090ced03a Closes #10344 Authored by: Grub4K Co-authored-by: Simon Sawicki --- yt_dlp/utils/_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index b5e1e2950..e00c75f6a 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5120,6 +5120,9 @@ def __init__(self, extension, /): @classmethod def sanitize_extension(cls, extension, /, *, prepend=False): + if extension is None: + return None + if '/' in extension or '\\' in extension: raise cls(extension) From 6075a029dba70a89675ae1250e7cdfd91f0eba41 Mon Sep 17 00:00:00 2001 From: Thomas Gerbet Date: Thu, 4 Jul 2024 00:35:24 +0200 Subject: [PATCH 09/95] [ie/douyutv] Do not use dangerous javascript source/URL (#10347) Ref: https://sansec.io/research/polyfill-supply-chain-attack Authored by: LeSuisse --- yt_dlp/extractor/douyutv.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index fdf19c252..e36eac919 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -24,8 +24,9 @@ class DouyuBaseIE(InfoExtractor): def _download_cryptojs_md5(self, video_id): for url in [ + # XXX: Do NOT use cdn.bootcdn.net; ref: https://sansec.io/research/polyfill-supply-chain-attack 'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js', - 'https://cdn.bootcdn.net/ajax/libs/crypto-js/3.1.2/rollups/md5.js', + 'https://unpkg.com/cryptojslib@3.1.2/rollups/md5.js', ]: js_code = self._download_webpage( url, video_id, note='Downloading signing dependency', fatal=False) @@ -35,7 +36,8 @@ def _download_cryptojs_md5(self, video_id): raise ExtractorError('Unable to download JS dependency (crypto-js/md5)') def _get_cryptojs_md5(self, video_id): - return self.cache.load('douyu', 'crypto-js-md5') or self._download_cryptojs_md5(video_id) + return self.cache.load( + 'douyu', 'crypto-js-md5', min_ver='2024.07.04') or self._download_cryptojs_md5(video_id) def _calc_sign(self, sign_func, video_id, a): b = uuid.uuid4().hex From c1c9bb4adb42d0d93a2fb5d93a7de0a87b6ba884 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 5 Jul 2024 13:32:53 -0500 Subject: [PATCH 10/95] [ie/vimeo] Fix password-protected video extraction (#10341) Closes #6603 Authored by: bashonly --- yt_dlp/extractor/vimeo.py | 87 ++++++++++++++++++++------------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index a4ab7e24a..18eb08444 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -1,6 +1,7 @@ import base64 import functools import itertools +import json import re import urllib.parse @@ -14,6 +15,7 @@ determine_ext, get_element_by_class, int_or_none, + join_nonempty, js_to_json, merge_dicts, parse_filesize, @@ -84,29 +86,23 @@ def _get_video_password(self): expected=True) return password - def _verify_video_password(self, url, video_id, password, token, vuid): - if url.startswith('http://'): - # vimeo only supports https now, but the user can give an http url - url = url.replace('http://', 'https://') - self._set_vimeo_cookie('vuid', vuid) - return self._download_webpage( - url + '/password', video_id, 'Verifying the password', - 'Wrong password', data=urlencode_postdata({ - 'password': password, - 'token': token, - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': url, - }) - - def _extract_xsrft_and_vuid(self, webpage): - xsrft = self._search_regex( - r'(?:(?P["\'])xsrft(?P=q1)\s*:|xsrft\s*[=:])\s*(?P["\'])(?P.+?)(?P=q)', - webpage, 'login token', group='xsrft') - vuid = self._search_regex( - r'["\']vuid["\']\s*:\s*(["\'])(?P.+?)\1', - webpage, 'vuid', group='vuid') - return xsrft, vuid + def _verify_video_password(self, video_id, password, token): + url = f'https://vimeo.com/{video_id}' + try: + return self._download_webpage( + f'{url}/password', video_id, + 'Submitting video password', data=json.dumps({ + 'password': password, + 'token': token, + }, separators=(',', ':')).encode(), headers={ + 'Accept': '*/*', + 'Content-Type': 'application/json', + 'Referer': url, + }, impersonate=True) + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 418: + raise ExtractorError('Wrong password', expected=True) + raise def _extract_vimeo_config(self, webpage, video_id, *args, **kwargs): vimeo_config = self._search_regex( @@ -745,21 +741,34 @@ def _verify_player_video_password(self, url, video_id, headers): raise ExtractorError('Wrong video password', expected=True) return checked - def _extract_from_api(self, video_id, unlisted_hash=None): - token = self._download_json( - 'https://vimeo.com/_rv/jwt', video_id, headers={ - 'X-Requested-With': 'XMLHttpRequest', - })['token'] - api_url = 'https://api.vimeo.com/videos/' + video_id - if unlisted_hash: - api_url += ':' + unlisted_hash - video = self._download_json( - api_url, video_id, headers={ - 'Authorization': 'jwt ' + token, + def _call_videos_api(self, video_id, jwt_token, unlisted_hash=None): + return self._download_json( + join_nonempty(f'https://api.vimeo.com/videos/{video_id}', unlisted_hash, delim=':'), + video_id, 'Downloading API JSON', headers={ + 'Authorization': f'jwt {jwt_token}', 'Accept': 'application/json', }, query={ 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays', }) + + def _extract_from_api(self, video_id, unlisted_hash=None): + viewer = self._download_json( + 'https://vimeo.com/_next/viewer', video_id, 'Downloading viewer info') + + for retry in (False, True): + try: + video = self._call_videos_api(video_id, viewer['jwt'], unlisted_hash) + except ExtractorError as e: + if (not retry and isinstance(e.cause, HTTPError) and e.cause.status == 400 + and 'password' in traverse_obj( + e.cause.response.read(), + ({bytes.decode}, {json.loads}, 'invalid_parameters', ..., 'field'), + )): + self._verify_video_password( + video_id, self._get_video_password(), viewer['xsrft']) + continue + raise + info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) @@ -865,12 +874,6 @@ def _real_extract(self, url): redirect_url, video_id, headers) return self._parse_config(config, video_id) - if re.search(r']+?id="pw_form"', webpage): - video_password = self._get_video_password() - token, vuid = self._extract_xsrft_and_vuid(webpage) - webpage = self._verify_video_password( - redirect_url, video_id, video_password, token, vuid) - vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None) if vimeo_config: seed_status = vimeo_config.get('seed_status') or {} @@ -1290,9 +1293,7 @@ def _real_extract(self, url): video_password = self._get_video_password() viewer = self._download_json( 'https://vimeo.com/_rv/viewer', video_id) - webpage = self._verify_video_password( - 'https://vimeo.com/' + video_id, video_id, - video_password, viewer['xsrft'], viewer['vuid']) + webpage = self._verify_video_password(video_id, video_password, viewer['xsrft']) clip_page_config = self._parse_json(self._search_regex( r'window\.vimeo\.clip_page_config\s*=\s*({.+?});', webpage, 'clip page config'), video_id) From 2a1a1b8e67e864289ac7ba5d05ec63dbb19a639f Mon Sep 17 00:00:00 2001 From: middlingphys <38708390+middlingphys@users.noreply.github.com> Date: Sat, 6 Jul 2024 07:31:16 +0900 Subject: [PATCH 11/95] [ie/abematv] Extract availability (#10348) Authored by: middlingphys --- yt_dlp/extractor/abematv.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 293a6c40e..9471df1da 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -368,6 +368,7 @@ def _real_extract(self, url): info['episode_number'] = epis if epis < 2000 else None is_live, m3u8_url = False, None + availability = 'public' if video_type == 'now-on-air': is_live = True channel_url = 'https://api.abema.io/v1/channels' @@ -389,6 +390,7 @@ def _real_extract(self, url): if 3 not in ondemand_types: # cannot acquire decryption key for these streams self.report_warning('This is a premium-only stream') + availability = 'premium_only' info.update(traverse_obj(api_response, { 'series': ('series', 'title'), 'season': ('season', 'name'), @@ -408,6 +410,7 @@ def _real_extract(self, url): headers=headers) if not traverse_obj(api_response, ('slot', 'flags', 'timeshiftFree'), default=False): self.report_warning('This is a premium-only stream') + availability = 'premium_only' m3u8_url = f'https://vod-abematv.akamaized.net/slot/{video_id}/playlist.m3u8' else: @@ -425,6 +428,7 @@ def _real_extract(self, url): 'description': description, 'formats': formats, 'is_live': is_live, + 'availability': availability, }) return info From 00766ece0c5c7a80781a4ff677198c5fb69d9dc0 Mon Sep 17 00:00:00 2001 From: Sean Ellingham Date: Sat, 6 Jul 2024 00:02:35 +0100 Subject: [PATCH 12/95] [ie/vidyard] Add extractor (#10155) Closes #4618 Authored by: exterrestris --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/cellebrite.py | 69 +++--- yt_dlp/extractor/swearnet.py | 64 ++--- yt_dlp/extractor/vidyard.py | 426 ++++++++++++++++++++++++++++++++ 4 files changed, 470 insertions(+), 90 deletions(-) create mode 100644 yt_dlp/extractor/vidyard.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 7f6507def..34dea79ef 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2324,6 +2324,7 @@ ) from .vidlii import VidLiiIE from .vidly import VidlyIE +from .vidyard import VidyardIE from .viewlift import ( ViewLiftEmbedIE, ViewLiftIE, diff --git a/yt_dlp/extractor/cellebrite.py b/yt_dlp/extractor/cellebrite.py index e90365a8b..54367c4d5 100644 --- a/yt_dlp/extractor/cellebrite.py +++ b/yt_dlp/extractor/cellebrite.py @@ -1,63 +1,50 @@ -from .common import InfoExtractor -from ..utils import traverse_obj +from .vidyard import VidyardBaseIE, VidyardIE +from ..utils import ExtractorError, make_archive_id, url_basename -class CellebriteIE(InfoExtractor): +class CellebriteIE(VidyardBaseIE): _VALID_URL = r'https?://cellebrite\.com/(?:\w+)?/(?P[\w-]+)' _TESTS = [{ 'url': 'https://cellebrite.com/en/collect-data-from-android-devices-with-cellebrite-ufed/', 'info_dict': { - 'id': '16025876', + 'id': 'ZqmUss3dQfEMGpauambPuH', + 'display_id': '16025876', 'ext': 'mp4', - 'description': 'md5:174571cb97083fd1d457d75c684f4e2b', - 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2021/05/Chat-Capture-1024x559.png', 'title': 'Ask the Expert: Chat Capture - Collect Data from Android Devices in Cellebrite UFED', - 'duration': 455, - 'tags': [], + 'description': 'md5:dee48fe12bbae5c01fe6a053f7676da4', + 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2021/05/Chat-Capture-1024x559.png', + 'duration': 455.979, + '_old_archive_ids': ['cellebrite 16025876'], }, }, { 'url': 'https://cellebrite.com/en/how-to-lawfully-collect-the-maximum-amount-of-data-from-android-devices/', 'info_dict': { - 'id': '29018255', + 'id': 'QV1U8a2yzcxigw7VFnqKyg', + 'display_id': '29018255', 'ext': 'mp4', - 'duration': 134, - 'tags': [], - 'description': 'md5:e9a3d124c7287b0b07bad2547061cacf', + 'title': 'How to Lawfully Collect the Maximum Amount of Data From Android Devices', + 'description': 'md5:0e943a9ac14c374d5d74faed634d773c', 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2022/07/How-to-Lawfully-Collect-the-Maximum-Amount-of-Data-From-Android-Devices.png', - 'title': 'Android Extractions Explained', + 'duration': 134.315, + '_old_archive_ids': ['cellebrite 29018255'], }, }] - def _get_formats_and_subtitles(self, json_data, display_id): - formats = [{'url': url} for url in traverse_obj(json_data, ('mp4', ..., 'url')) or []] - subtitles = {} - - for url in traverse_obj(json_data, ('hls', ..., 'url')) or []: - fmt, sub = self._extract_m3u8_formats_and_subtitles( - url, display_id, ext='mp4', headers={'Referer': 'https://play.vidyard.com/'}) - formats.extend(fmt) - self._merge_subtitles(sub, target=subtitles) - - return formats, subtitles - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + slug = self._match_id(url) + webpage = self._download_webpage(url, slug) + vidyard_url = next(VidyardIE._extract_embed_urls(url, webpage), None) + if not vidyard_url: + raise ExtractorError('No Vidyard video embeds found on page') - player_uuid = self._search_regex( - r']*\bdata-uuid\s*=\s*"([^"\?]+)', webpage, 'player UUID') - json_data = self._download_json( - f'https://play.vidyard.com/player/{player_uuid}.json', display_id)['payload']['chapters'][0] + video_id = url_basename(vidyard_url) + info = self._process_video_json(self._fetch_video_json(video_id)['chapters'][0], video_id) + if info.get('display_id'): + info['_old_archive_ids'] = [make_archive_id(self, info['display_id'])] + if thumbnail := self._og_search_thumbnail(webpage, default=None): + info.setdefault('thumbnails', []).append({'url': thumbnail}) - formats, subtitles = self._get_formats_and_subtitles(json_data['sources'], display_id) return { - 'id': str(json_data['videoId']), - 'title': json_data.get('name') or self._og_search_title(webpage), - 'formats': formats, - 'subtitles': subtitles, - 'description': json_data.get('description') or self._og_search_description(webpage), - 'duration': json_data.get('seconds'), - 'tags': json_data.get('tags'), - 'thumbnail': self._og_search_thumbnail(webpage), - 'http_headers': {'Referer': 'https://play.vidyard.com/'}, + 'description': self._og_search_description(webpage, default=None), + **info, } diff --git a/yt_dlp/extractor/swearnet.py b/yt_dlp/extractor/swearnet.py index b4835c5ad..2d6fb3eb4 100644 --- a/yt_dlp/extractor/swearnet.py +++ b/yt_dlp/extractor/swearnet.py @@ -1,55 +1,31 @@ -from .common import InfoExtractor -from ..utils import ExtractorError, int_or_none, traverse_obj +from .vidyard import VidyardBaseIE +from ..utils import ExtractorError, int_or_none, make_archive_id -class SwearnetEpisodeIE(InfoExtractor): +class SwearnetEpisodeIE(VidyardBaseIE): _VALID_URL = r'https?://www\.swearnet\.com/shows/(?P[\w-]+)/seasons/(?P\d+)/episodes/(?P\d+)' _TESTS = [{ 'url': 'https://www.swearnet.com/shows/gettin-learnt-with-ricky/seasons/1/episodes/1', 'info_dict': { - 'id': '232819', + 'id': 'wicK2EOzjOdxkUXGDIgcPw', + 'display_id': '232819', 'ext': 'mp4', 'episode_number': 1, 'episode': 'Episode 1', 'duration': 719, - 'description': 'md5:c48ef71440ce466284c07085cd7bd761', + 'description': r're:Are you drunk and high and craving a grilled cheese sandwich.+', 'season': 'Season 1', 'title': 'Episode 1 - Grilled Cheese Sammich', 'season_number': 1, - 'thumbnail': 'https://cdn.vidyard.com/thumbnails/232819/_RX04IKIq60a2V6rIRqq_Q_small.jpg', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/custom/0dd74f9b-388a-452e-b570-b407fb64435b_small.jpg', + 'tags': ['Getting Learnt with Ricky', 'drunk', 'grilled cheese', 'high'], + '_old_archive_ids': ['swearnetepisode 232819'], }, }] - def _get_formats_and_subtitle(self, video_source, video_id): - video_source = video_source or {} - formats, subtitles = [], {} - for key, value in video_source.items(): - if key == 'hls': - for video_hls in value: - fmts, subs = self._extract_m3u8_formats_and_subtitles(video_hls.get('url'), video_id) - formats.extend(fmts) - self._merge_subtitles(subs, target=subtitles) - else: - formats.extend({ - 'url': video_mp4.get('url'), - 'ext': 'mp4', - } for video_mp4 in value) - - return formats, subtitles - - def _get_direct_subtitle(self, caption_json): - subs = {} - for caption in caption_json: - subs.setdefault(caption.get('language') or 'und', []).append({ - 'url': caption.get('vttUrl'), - 'name': caption.get('name'), - }) - - return subs - def _real_extract(self, url): - display_id, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num') - webpage = self._download_webpage(url, display_id) + slug, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num') + webpage = self._download_webpage(url, slug) try: external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid') @@ -58,22 +34,12 @@ def _real_extract(self, url): self.raise_login_required() raise - json_data = self._download_json( - f'https://play.vidyard.com/player/{external_id}.json', display_id)['payload']['chapters'][0] - - formats, subtitles = self._get_formats_and_subtitle(json_data['sources'], display_id) - self._merge_subtitles(self._get_direct_subtitle(json_data.get('captions')), target=subtitles) + info = self._process_video_json(self._fetch_video_json(external_id)['chapters'][0], external_id) + if info.get('display_id'): + info['_old_archive_ids'] = [make_archive_id(self, info['display_id'])] return { - 'id': str(json_data['videoId']), - 'title': json_data.get('name') or self._html_search_meta(['og:title', 'twitter:title'], webpage), - 'description': (json_data.get('description') - or self._html_search_meta(['og:description', 'twitter:description'], webpage)), - 'duration': int_or_none(json_data.get('seconds')), - 'formats': formats, - 'subtitles': subtitles, + **info, 'season_number': int_or_none(season_number), 'episode_number': int_or_none(episode_number), - 'thumbnails': [{'url': thumbnail_url} - for thumbnail_url in traverse_obj(json_data, ('thumbnailUrls', ...))], } diff --git a/yt_dlp/extractor/vidyard.py b/yt_dlp/extractor/vidyard.py new file mode 100644 index 000000000..20a54b161 --- /dev/null +++ b/yt_dlp/extractor/vidyard.py @@ -0,0 +1,426 @@ +import functools +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + float_or_none, + int_or_none, + join_nonempty, + mimetype2ext, + parse_resolution, + str_or_none, + unescapeHTML, + url_or_none, +) +from ..utils.traversal import traverse_obj + + +class VidyardBaseIE(InfoExtractor): + _HEADERS = {'Referer': 'https://play.vidyard.com/'} + + def _get_formats_and_subtitles(self, sources, video_id): + formats, subtitles = [], {} + + def add_hls_fmts_and_subs(m3u8_url): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', m3u8_id='hls', headers=self._HEADERS, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + hls_list = isinstance(sources, dict) and sources.pop('hls', None) + if master_m3u8_url := traverse_obj( + hls_list, (lambda _, v: v['profile'] == 'auto', 'url', {url_or_none}, any)): + add_hls_fmts_and_subs(master_m3u8_url) + if not formats: # These are duplicate and unnecesary requests if we got 'auto' hls fmts + for variant_m3u8_url in traverse_obj(hls_list, (..., 'url', {url_or_none})): + add_hls_fmts_and_subs(variant_m3u8_url) + + for source_type, source_list in traverse_obj(sources, ({dict.items}, ...)): + for source in traverse_obj(source_list, lambda _, v: url_or_none(v['url'])): + profile = source.get('profile') + formats.append({ + 'url': source['url'], + 'ext': mimetype2ext(source.get('mimeType'), default=None), + 'format_id': join_nonempty('http', source_type, profile), + **parse_resolution(profile), + }) + + self._remove_duplicate_formats(formats) + return formats, subtitles + + def _get_direct_subtitles(self, caption_json): + subs = {} + for caption in traverse_obj(caption_json, lambda _, v: url_or_none(v['vttUrl'])): + subs.setdefault(caption.get('language') or 'und', []).append({ + 'url': caption['vttUrl'], + 'name': caption.get('name'), + }) + + return subs + + def _fetch_video_json(self, video_id): + return self._download_json( + f'https://play.vidyard.com/player/{video_id}.json', video_id)['payload'] + + def _process_video_json(self, json_data, video_id): + formats, subtitles = self._get_formats_and_subtitles(json_data['sources'], video_id) + self._merge_subtitles(self._get_direct_subtitles(json_data.get('captions')), target=subtitles) + + return { + **traverse_obj(json_data, { + 'id': ('facadeUuid', {str}), + 'display_id': ('videoId', {int}, {str_or_none}), + 'title': ('name', {str}), + 'description': ('description', {str}, {unescapeHTML}, {lambda x: x or None}), + 'duration': (( + ('milliseconds', {functools.partial(float_or_none, scale=1000)}), + ('seconds', {int_or_none})), any), + 'thumbnails': ('thumbnailUrls', ('small', 'normal'), {'url': {url_or_none}}), + 'tags': ('tags', ..., 'name', {str}), + }), + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': self._HEADERS, + } + + +class VidyardIE(VidyardBaseIE): + _VALID_URL = [ + r'https?://[\w-]+(?:\.hubs)?\.vidyard\.com/watch/(?P[\w-]+)', + r'https?://(?:embed|share)\.vidyard\.com/share/(?P[\w-]+)', + r'https?://play\.vidyard\.com/(?:player/)?(?P[\w-]+)', + ] + _EMBED_REGEX = [r']* src=["\'](?P(?:https?:)?//play\.vidyard\.com/[\w-]+)'] + _TESTS = [{ + 'url': 'https://vyexample03.hubs.vidyard.com/watch/oTDMPlUv--51Th455G5u7Q', + 'info_dict': { + 'id': 'oTDMPlUv--51Th455G5u7Q', + 'display_id': '50347', + 'ext': 'mp4', + 'title': 'Homepage Video', + 'description': 'Look I changed the description.', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/50347/OUPa5LTKV46849sLYngMqQ_small.jpg', + 'duration': 99, + 'tags': ['these', 'are', 'all', 'tags'], + }, + }, { + 'url': 'https://share.vidyard.com/watch/PaQzDAT1h8JqB8ivEu2j6Y?', + 'info_dict': { + 'id': 'PaQzDAT1h8JqB8ivEu2j6Y', + 'display_id': '9281024', + 'ext': 'mp4', + 'title': 'Inline Embed', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/spacer.gif', + 'duration': 41.186, + }, + }, { + 'url': 'https://embed.vidyard.com/share/oTDMPlUv--51Th455G5u7Q', + 'info_dict': { + 'id': 'oTDMPlUv--51Th455G5u7Q', + 'display_id': '50347', + 'ext': 'mp4', + 'title': 'Homepage Video', + 'description': 'Look I changed the description.', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/50347/OUPa5LTKV46849sLYngMqQ_small.jpg', + 'duration': 99, + 'tags': ['these', 'are', 'all', 'tags'], + }, + }, { + # First video from playlist below + 'url': 'https://embed.vidyard.com/share/SyStyHtYujcBHe5PkZc5DL', + 'info_dict': { + 'id': 'SyStyHtYujcBHe5PkZc5DL', + 'display_id': '41974005', + 'ext': 'mp4', + 'title': 'Prepare the Frame and Track for Palm Beach Polysatin Shutters With BiFold Track', + 'description': r're:In this video, you will learn how to prepare the frame.+', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/41974005/IJw7oCaJcF1h7WWu3OVZ8A_small.png', + 'duration': 258.666, + }, + }, { + # Playlist + 'url': 'https://thelink.hubs.vidyard.com/watch/pwu7pCYWSwAnPxs8nDoFrE', + 'info_dict': { + 'id': 'pwu7pCYWSwAnPxs8nDoFrE', + 'title': 'PLAYLIST - Palm Beach Shutters- Bi-Fold Track System Installation', + 'entries': [{ + 'id': 'SyStyHtYujcBHe5PkZc5DL', + 'display_id': '41974005', + 'ext': 'mp4', + 'title': 'Prepare the Frame and Track for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/41974005/IJw7oCaJcF1h7WWu3OVZ8A_small.png', + 'duration': 258.666, + }, { + 'id': '1Fw4B84jZTXLXWqkE71RiM', + 'display_id': '5861113', + 'ext': 'mp4', + 'title': 'Palm Beach - Bi-Fold Track System "Frame Installation"', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861113/29CJ54s5g1_aP38zkKLHew_small.jpg', + 'duration': 167.858, + }, { + 'id': 'DqP3wBvLXSpxrcqpT5kEeo', + 'display_id': '41976334', + 'ext': 'mp4', + 'title': 'Install the Track for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861090/RwG2VaTylUa6KhSTED1r1Q_small.png', + 'duration': 94.229, + }, { + 'id': 'opfybfxpzQArxqtQYB6oBU', + 'display_id': '41976364', + 'ext': 'mp4', + 'title': 'Install the Panel for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5860926/JIOaJR08dM4QgXi_iQ2zGA_small.png', + 'duration': 191.467, + }, { + 'id': 'rWrXvkbTNNaNqD6189HJya', + 'display_id': '41976382', + 'ext': 'mp4', + 'title': 'Adjust the Panels for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5860687/CwHxBv4UudAhOh43FVB4tw_small.png', + 'duration': 138.155, + }, { + 'id': 'eYPTB521MZ9TPEArSethQ5', + 'display_id': '41976409', + 'ext': 'mp4', + 'title': 'Assemble and Install the Valance for Palm Beach Polysatin Shutters With BiFold Track', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/5861425/0y68qlMU4O5VKU7bJ8i_AA_small.png', + 'duration': 148.224, + }], + }, + 'playlist_count': 6, + }, { + # Non hubs.vidyard.com playlist + 'url': 'https://salesforce.vidyard.com/watch/d4vqPjs7Q5EzVEis5QT3jd', + 'info_dict': { + 'id': 'd4vqPjs7Q5EzVEis5QT3jd', + 'title': 'How To: Service Cloud: Import External Content in Lightning Knowledge', + 'entries': [{ + 'id': 'mcjDpSZir2iSttbvFkx6Rv', + 'display_id': '29479036', + 'ext': 'mp4', + 'title': 'Welcome to this Expert Coaching Series', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/ouyQi9WuwyiOupChUWNmjQ/7170d3485ba602e012df05_small.jpg', + 'duration': 38.205, + }, { + 'id': '84bPYwpg243G6xYEfJdYw9', + 'display_id': '21820704', + 'ext': 'mp4', + 'title': 'Chapter 1 - Title + Agenda', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/HFPN0ZgQq4Ow8BghGcQSow/bfaa30123c8f6601e7d7f2_small.jpg', + 'duration': 98.016, + }, { + 'id': 'nP17fMuvA66buVHUrzqjTi', + 'display_id': '21820707', + 'ext': 'mp4', + 'title': 'Chapter 2 - Import Options', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/rGRIF5nFjPI9OOA2qJ_Dbg/86a8d02bfec9a566845dd4_small.jpg', + 'duration': 199.136, + }, { + 'id': 'm54EcwXdpA5gDBH5rgCYoV', + 'display_id': '21820710', + 'ext': 'mp4', + 'title': 'Chapter 3 - Importing Article Translations', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/IVX4XR8zpSsiNIHx45kz-A/1ccbf8a29a33856d06b3ed_small.jpg', + 'duration': 184.352, + }, { + 'id': 'j4nzS42oq4hE9oRV73w3eQ', + 'display_id': '21820716', + 'ext': 'mp4', + 'title': 'Chapter 4 - Best Practices', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/BtrRrQpRDLbA4AT95YQyog/1f1e6b8e7fdc3fa95ec8d3_small.jpg', + 'duration': 296.960, + }, { + 'id': 'y28PYfW5pftvers9PXzisC', + 'display_id': '21820727', + 'ext': 'mp4', + 'title': 'Chapter 5 - Migration Steps', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/K2CdQOXDfLcrVTF60r0bdw/a09239ada28b6ffce12b1f_small.jpg', + 'duration': 620.640, + }, { + 'id': 'YWU1eQxYvhj29SjYoPw5jH', + 'display_id': '21820733', + 'ext': 'mp4', + 'title': 'Chapter 6 - Demo', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/rsmhP-cO8dAa8ilvFGCX0g/7911ef415167cd14032068_small.jpg', + 'duration': 631.456, + }, { + 'id': 'nmEvVqpwdJUgb74zKsLGxn', + 'display_id': '29479037', + 'ext': 'mp4', + 'title': 'Schedule Your Follow-Up', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/Rtwc7X4PEkF4Ae5kHi-Jvw/174ebed3f34227b1ffa1d0_small.jpg', + 'duration': 33.608, + }], + }, + 'playlist_count': 8, + }, { + # URL of iframe embed src + 'url': 'https://play.vidyard.com/iDqTwWGrd36vaLuaCY3nTs.html', + 'info_dict': { + 'id': 'iDqTwWGrd36vaLuaCY3nTs', + 'display_id': '9281009', + 'ext': 'mp4', + 'title': 'Lightbox Embed', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/spacer.gif', + 'duration': 39.035, + }, + }, { + # Player JSON URL + 'url': 'https://play.vidyard.com/player/7GAApnNNbcZZ46k6JqJQSh.json?disable_analytics=0', + 'info_dict': { + 'id': '7GAApnNNbcZZ46k6JqJQSh', + 'display_id': '820026', + 'ext': 'mp4', + 'title': 'The Art of Storytelling: How to Deliver Your Brand Story with Content & Social', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/MhbE-5sEFQu4x3fI6FkNlA/41eb5717c557cd19456910_small.jpg', + 'duration': 2153.013, + 'tags': ['Summit2017'], + }, + }, { + 'url': 'http://share.vidyard.com/share/diYeo6YR2yiGgL8odvS8Ri', + 'only_matching': True, + }, { + 'url': 'https://play.vidyard.com/FFlz3ZpxhIfKQ1fd9DAryA', + 'only_matching': True, + }, { + 'url': 'https://play.vidyard.com/qhMAu5A76GZVrFzOPgSf9A/type/standalone', + 'only_matching': True, + }] + _WEBPAGE_TESTS = [{ + # URL containing inline/lightbox embedded video + 'url': 'https://resources.altium.com/p/2-the-extreme-importance-of-pc-board-stack-up', + 'info_dict': { + 'id': 'GDx1oXrFWj4XHbipfoXaMn', + 'display_id': '3225198', + 'ext': 'mp4', + 'title': 'The Extreme Importance of PC Board Stack Up', + 'thumbnail': 'https://cdn.vidyard.com/thumbnails/73_Q3_hBexWX7Og1sae6cg/9998fa4faec921439e2c04_small.jpg', + 'duration': 3422.742, + }, + }, { + # ', - webpage, 'replay data', default='{}'), video_id, fatal=False) or {} + def yield_all_relay_data(_filter): + for relay_data in re.findall(rf'data-sjs>({{.*?{_filter}.*?}})', webpage): + yield self._parse_json(relay_data, video_id, fatal=False) or {} - def extract_relay_prefetched_data(_filter): - return traverse_obj(extract_relay_data(_filter), ( - 'require', (None, (..., ..., ..., '__bbox', 'require')), + def extract_relay_data(_filter): + return next(filter(None, yield_all_relay_data(_filter)), {}) + + def extract_relay_prefetched_data(_filter, target_keys=None): + path = 'data' + if target_keys is not None: + path = lambda k, v: k == 'data' and any(target in v for target in variadic(target_keys)) + return traverse_obj(yield_all_relay_data(_filter), ( + ..., 'require', (None, (..., ..., ..., '__bbox', 'require')), lambda _, v: any(key.startswith('RelayPrefetchedStreamCache') for key in v), - ..., ..., '__bbox', 'result', 'data', {dict}), get_all=False) or {} + ..., ..., '__bbox', 'result', path, {dict}), get_all=False) or {} if not video_data: server_js_data = self._parse_json(self._search_regex([ @@ -591,7 +596,8 @@ def extract_relay_prefetched_data(_filter): if not video_data: data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)') + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)', + target_keys=('video', 'event', 'nodes', 'node', 'mediaset')) if data: entries = [] From f0993391e6052ec8f7aacc286609564f226943b9 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 24 Jul 2024 16:22:55 -0500 Subject: [PATCH 62/95] [ie/mlbtv] Fix extractor (#10515) Closes #10510 Authored by: bashonly --- yt_dlp/extractor/mlb.py | 223 ++++++++++++++++++++++++++++++++-------- 1 file changed, 180 insertions(+), 43 deletions(-) diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 6f67602a6..230c218e7 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -1,16 +1,21 @@ +import json import re -import urllib.parse +import time import uuid from .common import InfoExtractor +from ..networking.exceptions import HTTPError from ..utils import ( + ExtractorError, determine_ext, int_or_none, join_nonempty, + jwt_decode_hs256, parse_duration, parse_iso8601, try_get, url_or_none, + urlencode_postdata, ) from ..utils.traversal import traverse_obj @@ -276,81 +281,213 @@ def _download_video_data(self, display_id): class MLBTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mlb\.com/tv/g(?P\d{6})' _NETRC_MACHINE = 'mlb' - _TESTS = [{ 'url': 'https://www.mlb.com/tv/g661581/vee2eff5f-a7df-4c20-bdb4-7b926fa12638', 'info_dict': { 'id': '661581', 'ext': 'mp4', 'title': '2022-07-02 - St. Louis Cardinals @ Philadelphia Phillies', + 'release_date': '20220702', + 'release_timestamp': 1656792300, }, 'params': { 'skip_download': True, }, }] + _GRAPHQL_INIT_QUERY = '''\ +mutation initSession($device: InitSessionInput!, $clientType: ClientType!, $experience: ExperienceTypeInput) { + initSession(device: $device, clientType: $clientType, experience: $experience) { + deviceId + sessionId + entitlements { + code + } + location { + countryCode + regionName + zipCode + latitude + longitude + } + clientExperience + features + } + }''' + _GRAPHQL_PLAYBACK_QUERY = '''\ +mutation initPlaybackSession( + $adCapabilities: [AdExperienceType] + $mediaId: String! + $deviceId: String! + $sessionId: String! + $quality: PlaybackQuality + ) { + initPlaybackSession( + adCapabilities: $adCapabilities + mediaId: $mediaId + deviceId: $deviceId + sessionId: $sessionId + quality: $quality + ) { + playbackSessionId + playback { + url + token + expiration + cdn + } + } + }''' + _APP_VERSION = '7.8.2' + _device_id = None + _session_id = None _access_token = None + _token_expiry = 0 + + @property + def _api_headers(self): + if (self._token_expiry - 120) <= time.time(): + self.write_debug('Access token has expired; re-logging in') + self._perform_login(*self._get_login_info()) + return {'Authorization': f'Bearer {self._access_token}'} def _real_initialize(self): if not self._access_token: self.raise_login_required( 'All videos are only available to registered users', method='password') + def _set_device_id(self, username): + if not self._device_id: + self._device_id = self.cache.load( + self._NETRC_MACHINE, 'device_ids', default={}).get(username) + if self._device_id: + return + self._device_id = str(uuid.uuid4()) + self.cache.store(self._NETRC_MACHINE, 'device_ids', {username: self._device_id}) + def _perform_login(self, username, password): - data = f'grant_type=password&username={urllib.parse.quote(username)}&password={urllib.parse.quote(password)}&scope=openid offline_access&client_id=0oa3e1nutA1HLzAKG356' - access_token = self._download_json( - 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None, - headers={ - 'User-Agent': 'okhttp/3.12.1', - 'Content-Type': 'application/x-www-form-urlencoded', - }, data=data.encode())['access_token'] + try: + self._access_token = self._download_json( + 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None, + 'Logging in', 'Unable to log in', headers={ + 'User-Agent': 'okhttp/3.12.1', + 'Content-Type': 'application/x-www-form-urlencoded', + }, data=urlencode_postdata({ + 'grant_type': 'password', + 'username': username, + 'password': password, + 'scope': 'openid offline_access', + 'client_id': '0oa3e1nutA1HLzAKG356', + }))['access_token'] + except ExtractorError as error: + if isinstance(error.cause, HTTPError) and error.cause.status == 400: + raise ExtractorError('Invalid username or password', expected=True) + raise - entitlement = self._download_webpage( - f'https://media-entitlement.mlb.com/api/v3/jwt?os=Android&appname=AtBat&did={uuid.uuid4()}', None, - headers={ - 'User-Agent': 'okhttp/3.12.1', - 'Authorization': f'Bearer {access_token}', - }) + self._token_expiry = traverse_obj(self._access_token, ({jwt_decode_hs256}, 'exp', {int})) or 0 + self._set_device_id(username) - data = f'grant_type=urn:ietf:params:oauth:grant-type:token-exchange&subject_token={entitlement}&subject_token_type=urn:ietf:params:oauth:token-type:jwt&platform=android-tv' - self._access_token = self._download_json( - 'https://us.edge.bamgrid.com/token', None, + self._session_id = self._call_api({ + 'operationName': 'initSession', + 'query': self._GRAPHQL_INIT_QUERY, + 'variables': { + 'device': { + 'appVersion': self._APP_VERSION, + 'deviceFamily': 'desktop', + 'knownDeviceId': self._device_id, + 'languagePreference': 'ENGLISH', + 'manufacturer': '', + 'model': '', + 'os': '', + 'osVersion': '', + }, + 'clientType': 'WEB', + }, + }, None, 'session ID')['data']['initSession']['sessionId'] + + def _call_api(self, data, video_id, description='GraphQL JSON', fatal=True): + return self._download_json( + 'https://media-gateway.mlb.com/graphql', video_id, + f'Downloading {description}', f'Unable to download {description}', fatal=fatal, headers={ + **self._api_headers, 'Accept': 'application/json', - 'Authorization': 'Bearer bWxidHYmYW5kcm9pZCYxLjAuMA.6LZMbH2r--rbXcgEabaDdIslpo4RyZrlVfWZhsAgXIk', - 'Content-Type': 'application/x-www-form-urlencoded', - }, data=data.encode())['access_token'] + 'Content-Type': 'application/json', + 'x-client-name': 'WEB', + 'x-client-version': self._APP_VERSION, + }, data=json.dumps(data, separators=(',', ':')).encode()) + + def _extract_formats_and_subtitles(self, broadcast, video_id): + feed = traverse_obj(broadcast, ('homeAway', {str.title})) + medium = traverse_obj(broadcast, ('type', {str})) + language = traverse_obj(broadcast, ('language', {str.lower})) + format_id = join_nonempty(feed, medium, language) + + response = self._call_api({ + 'operationName': 'initPlaybackSession', + 'query': self._GRAPHQL_PLAYBACK_QUERY, + 'variables': { + 'adCapabilities': ['GOOGLE_STANDALONE_AD_PODS'], + 'deviceId': self._device_id, + 'mediaId': broadcast['mediaId'], + 'quality': 'PLACEHOLDER', + 'sessionId': self._session_id, + }, + }, video_id, f'{format_id} broadcast JSON', fatal=False) + + playback = traverse_obj(response, ('data', 'initPlaybackSession', 'playback', {dict})) + m3u8_url = traverse_obj(playback, ('url', {url_or_none})) + token = traverse_obj(playback, ('token', {str})) + + if not (m3u8_url and token): + errors = '; '.join(traverse_obj(response, ('errors', ..., 'message', {str}))) + if 'not entitled' in errors: + raise ExtractorError(errors, expected=True) + elif errors: # Only warn when 'blacked out' since radio formats are available + self.report_warning(f'API returned errors for {format_id}: {errors}') + else: + self.report_warning(f'No formats available for {format_id} broadcast; skipping') + return [], {} + + cdn_headers = {'x-cdn-token': token} + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url.replace(f'/{token}/', '/'), video_id, 'mp4', + m3u8_id=format_id, fatal=False, headers=cdn_headers) + for fmt in fmts: + fmt['http_headers'] = cdn_headers + fmt.setdefault('format_note', join_nonempty(feed, medium, delim=' ')) + fmt.setdefault('language', language) + if fmt.get('vcodec') == 'none' and fmt['language'] == 'en': + fmt['source_preference'] = 10 + + return fmts, subs def _real_extract(self, url): video_id = self._match_id(url) - airings = self._download_json( - f'https://search-api-mlbtv.mlb.com/svc/search/v2/graphql/persisted/query/core/Airings?variables=%7B%22partnerProgramIds%22%3A%5B%22{video_id}%22%5D%2C%22applyEsniMediaRightsLabels%22%3Atrue%7D', - video_id)['data']['Airings'] + metadata = traverse_obj(self._download_json( + 'https://statsapi.mlb.com/api/v1/schedule', video_id, query={ + 'gamePk': video_id, + 'hydrate': 'broadcasts(all),statusFlags', + }), ('dates', ..., 'games', lambda _, v: str(v['gamePk']) == video_id and v['broadcasts'], any)) + + broadcasts = traverse_obj(metadata, ( + 'broadcasts', lambda _, v: v['mediaId'] and v['mediaState']['mediaStateCode'] != 'MEDIA_OFF')) formats, subtitles = [], {} - for airing in traverse_obj(airings, lambda _, v: v['playbackUrls'][0]['href']): - format_id = join_nonempty('feedType', 'feedLanguage', from_dict=airing) - m3u8_url = traverse_obj(self._download_json( - airing['playbackUrls'][0]['href'].format(scenario='browser~csai'), video_id, - note=f'Downloading {format_id} stream info JSON', - errnote=f'Failed to download {format_id} stream info, skipping', - fatal=False, headers={ - 'Authorization': self._access_token, - 'Accept': 'application/vnd.media-service+json; version=2', - }), ('stream', 'complete', {url_or_none})) - if not m3u8_url: - continue - f, s = self._extract_m3u8_formats_and_subtitles( - m3u8_url, video_id, 'mp4', m3u8_id=format_id, fatal=False) - formats.extend(f) - self._merge_subtitles(s, target=subtitles) + for broadcast in broadcasts: + fmts, subs = self._extract_formats_and_subtitles(broadcast, video_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) return { 'id': video_id, - 'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False), - 'is_live': traverse_obj(airings, (..., 'mediaConfig', 'productType'), get_all=False) == 'LIVE', + 'title': join_nonempty( + traverse_obj(metadata, ('officialDate', {str})), + traverse_obj(metadata, ('teams', ('away', 'home'), 'team', 'name', {str}, all, {' @ '.join})), + delim=' - '), + 'is_live': traverse_obj(broadcasts, (..., 'mediaState', 'mediaStateCode', {str}, any)) == 'MEDIA_ON', + 'release_timestamp': traverse_obj(metadata, ('gameDate', {parse_iso8601})), 'formats': formats, 'subtitles': subtitles, - 'http_headers': {'Authorization': f'Bearer {self._access_token}'}, } From 6b1e430d8e4af56cd4fcb8bdc00fca9b79356464 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 25 Jul 2024 03:29:27 +0000 Subject: [PATCH 63/95] Release 2024.07.25 Created by: bashonly :ci skip all :ci run dl --- Changelog.md | 13 +++++++++++++ yt_dlp/version.py | 6 +++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/Changelog.md b/Changelog.md index 194d75e8a..b2cad7dc4 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,19 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.07.25 + +#### Extractor changes +- **abematv**: [Adapt key retrieval to request handler framework](https://github.com/yt-dlp/yt-dlp/commit/a3bab4752a2b3d56e5a59b4e0411bb8f695c010b) ([#10491](https://github.com/yt-dlp/yt-dlp/issues/10491)) by [bashonly](https://github.com/bashonly) +- **facebook**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/1a34a802f44a1dab8f642c79c3cc810e21541d3b) ([#10531](https://github.com/yt-dlp/yt-dlp/issues/10531)) by [bashonly](https://github.com/bashonly) +- **mlbtv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/f0993391e6052ec8f7aacc286609564f226943b9) ([#10515](https://github.com/yt-dlp/yt-dlp/issues/10515)) by [bashonly](https://github.com/bashonly) +- **tiktok**: [Fix and deprioritize JSON subtitles](https://github.com/yt-dlp/yt-dlp/commit/2f97779f335ac069ecccd9c7bf81abf4a83cfe7a) ([#10516](https://github.com/yt-dlp/yt-dlp/issues/10516)) by [bashonly](https://github.com/bashonly) +- **vimeo**: [Fix chapters extraction](https://github.com/yt-dlp/yt-dlp/commit/a0a1bc3d8d8e3bb9a48a06e835815a0460e90e77) ([#10544](https://github.com/yt-dlp/yt-dlp/issues/10544)) by [bashonly](https://github.com/bashonly) +- **youtube**: [Fix `n` function name extraction for player `3400486c`](https://github.com/yt-dlp/yt-dlp/commit/713b4cd18f00556771af8cfdd9cea6cc1a09e948) ([#10542](https://github.com/yt-dlp/yt-dlp/issues/10542)) by [bashonly](https://github.com/bashonly) + +#### Misc. changes +- **build**: [Pin `setuptools` version](https://github.com/yt-dlp/yt-dlp/commit/e046db8a116b1c320d4785daadd48ea0b22a3987) ([#10493](https://github.com/yt-dlp/yt-dlp/issues/10493)) by [bashonly](https://github.com/bashonly) + ### 2024.07.16 #### Core changes diff --git a/yt_dlp/version.py b/yt_dlp/version.py index db5b342d1..e641bf5ae 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.07.16' +__version__ = '2024.07.25' -RELEASE_GIT_HEAD = '89a161e8c62569a662deda1c948664152efcb6b4' +RELEASE_GIT_HEAD = 'f0993391e6052ec8f7aacc286609564f226943b9' VARIANT = None @@ -12,4 +12,4 @@ ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.07.16' +_pkg_version = '2024.07.25' From 0b7728618417e1aa382722a4d29b916b594d4459 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 25 Jul 2024 17:00:58 -0500 Subject: [PATCH 64/95] [ie/DiscoveryPlus] Support olympics URLs (#10566) Closes #10564 Authored by: bashonly --- yt_dlp/extractor/dplay.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index e9f9357ad..cdf84c52d 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -934,7 +934,7 @@ class TLCIE(DiscoveryPlusBaseIE): class DiscoveryPlusIE(DiscoveryPlusBaseIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:(?P[a-z]{2})/)?video(?:/sport)?' + DPlayBaseIE._PATH_REGEX + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:(?P[a-z]{2})/)?video(?:/sport|/olympics)?' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', 'info_dict': { @@ -958,6 +958,9 @@ class DiscoveryPlusIE(DiscoveryPlusBaseIE): }, { 'url': 'https://www.discoveryplus.com/gb/video/sport/eurosport-1-british-eurosport-1-british-sport/6-hours-of-spa-review', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.com/gb/video/olympics/dplus-sport-dplus-sport-sport/rugby-sevens-australia-samoa', + 'only_matching': True, }] _PRODUCT = None From 28d485714fef88937c82635438afba5db81f9089 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 25 Jul 2024 17:30:00 -0500 Subject: [PATCH 65/95] [ie/tva] Fix extractor (#10567) Closes #10555 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 5 +-- yt_dlp/extractor/tva.py | 72 +++++++++++++-------------------- yt_dlp/extractor/unsupported.py | 4 ++ 3 files changed, 32 insertions(+), 49 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index d2140bc52..dcce7ffa7 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2169,10 +2169,7 @@ TV5UnisVideoIE, ) from .tv24ua import TV24UAVideoIE -from .tva import ( - TVAIE, - QubIE, -) +from .tva import TVAIE from .tvanouvelles import ( TVANouvellesArticleIE, TVANouvellesIE, diff --git a/yt_dlp/extractor/tva.py b/yt_dlp/extractor/tva.py index e3e10557c..d702640f3 100644 --- a/yt_dlp/extractor/tva.py +++ b/yt_dlp/extractor/tva.py @@ -1,60 +1,29 @@ import functools import re +from .brightcove import BrightcoveNewIE from .common import InfoExtractor from ..utils import float_or_none, int_or_none, smuggle_url, strip_or_none from ..utils.traversal import traverse_obj class TVAIE(InfoExtractor): - _VALID_URL = r'https?://videos?\.tva\.ca/details/_(?P\d+)' + IE_NAME = 'tvaplus' + IE_DESC = 'TVA+' + _VALID_URL = r'https?://(?:www\.)?tvaplus\.ca/(?:[^/?#]+/)*[\w-]+-(?P\d+)(?:$|[#?])' _TESTS = [{ - 'url': 'https://videos.tva.ca/details/_5596811470001', - 'info_dict': { - 'id': '5596811470001', - 'ext': 'mp4', - 'title': 'Un extrait de l\'épisode du dimanche 8 octobre 2017 !', - 'uploader_id': '5481942443001', - 'upload_date': '20171003', - 'timestamp': 1507064617, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'skip': 'HTTP Error 404: Not Found', - }, { - 'url': 'https://video.tva.ca/details/_5596811470001', - 'only_matching': True, - }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s' - - def _real_extract(self, url): - video_id = self._match_id(url) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}), - 'ie_key': 'BrightcoveNew', - } - - -class QubIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?qub\.ca/(?:[^/]+/)*[0-9a-z-]+-(?P\d+)' - _TESTS = [{ - 'url': 'https://www.qub.ca/tvaplus/tva/alerte-amber/saison-1/episode-01-1000036619', + 'url': 'https://www.tvaplus.ca/tva/alerte-amber/saison-1/episode-01-1000036619', 'md5': '949490fd0e7aee11d0543777611fbd53', 'info_dict': { 'id': '6084352463001', 'ext': 'mp4', - 'title': 'Ép 01. Mon dernier jour', + 'title': 'Mon dernier jour', 'uploader_id': '5481942443001', 'upload_date': '20190907', 'timestamp': 1567899756, 'description': 'md5:9c0d7fbb90939420c651fd977df90145', 'thumbnail': r're:https://.+\.jpg', - 'episode': 'Ép 01. Mon dernier jour', + 'episode': 'Mon dernier jour', 'episode_number': 1, 'tags': ['alerte amber', 'alerte amber saison 1', 'surdemande'], 'duration': 2625.963, @@ -64,23 +33,36 @@ class QubIE(InfoExtractor): 'channel': 'TVA', }, }, { - 'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943', - 'only_matching': True, + 'url': 'https://www.tvaplus.ca/tva/le-baiser-du-barbu/le-baiser-du-barbu-886644190', + 'info_dict': { + 'id': '6354448043112', + 'ext': 'mp4', + 'title': 'Le Baiser du barbu', + 'uploader_id': '5481942443001', + 'upload_date': '20240606', + 'timestamp': 1717694023, + 'description': 'md5:025b1219086c1cbf4bc27e4e034e8b57', + 'thumbnail': r're:https://.+\.jpg', + 'episode': 'Le Baiser du barbu', + 'tags': ['fullepisode', 'films'], + 'duration': 6053.504, + 'series': 'Le Baiser du barbu', + 'channel': 'TVA', + }, }] - # reference_id also works with old account_id(5481942443001) - # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5813221784001/default_default/index.html?videoId=ref:%s' + _BC_URL_TMPL = 'https://players.brightcove.net/5481942443001/default_default/index.html?videoId={}' def _real_extract(self, url): entity_id = self._match_id(url) webpage = self._download_webpage(url, entity_id) - entity = self._search_nextjs_data(webpage, entity_id)['props']['initialProps']['pageProps']['fallbackData'] + entity = self._search_nextjs_data(webpage, entity_id)['props']['pageProps']['staticEntity'] video_id = entity['videoId'] episode = strip_or_none(entity.get('name')) return { '_type': 'url_transparent', - 'url': f'https://videos.tva.ca/details/_{video_id}', - 'ie_key': TVAIE.ie_key(), + 'url': smuggle_url(self._BC_URL_TMPL.format(video_id), {'geo_countries': ['CA']}), + 'ie_key': BrightcoveNewIE.ie_key(), 'id': video_id, 'title': episode, 'episode': episode, diff --git a/yt_dlp/extractor/unsupported.py b/yt_dlp/extractor/unsupported.py index 1e2d118aa..8b7ec1dd9 100644 --- a/yt_dlp/extractor/unsupported.py +++ b/yt_dlp/extractor/unsupported.py @@ -49,6 +49,7 @@ class KnownDRMIE(UnsupportedInfoExtractor): r'amazon\.(?:\w{2}\.)?\w+/gp/video', r'music\.amazon\.(?:\w{2}\.)?\w+', r'(?:watch|front)\.njpwworld\.com', + r'qub\.ca/vrai', ) _TESTS = [{ @@ -149,6 +150,9 @@ class KnownDRMIE(UnsupportedInfoExtractor): }, { 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', 'only_matching': True, + }, { + 'url': 'https://www.qub.ca/vrai/l-effet-bocuse-d-or/saison-1/l-effet-bocuse-d-or-saison-1-bande-annonce-1098225063', + 'only_matching': True, }] def _real_extract(self, url): From 6daf2c27c0464fba98337be30de0b66d520d0db1 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 29 Jul 2024 00:35:46 -0500 Subject: [PATCH 66/95] [utils] `unified_timestamp`: Recognize Sunday (#10589) Authored by: bashonly --- test/test_utils.py | 2 ++ yt_dlp/utils/_utils.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test_utils.py b/test/test_utils.py index e82f551bc..a2b459352 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -444,6 +444,8 @@ def test_unified_timestamps(self): self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363) + self.assertEqual(unified_timestamp('Sunday, 26 Nov 2006, 19:00'), 1164567600) + self.assertEqual(unified_timestamp('wed, aug 16, 2008, 12:00pm'), 1218931200) self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1) self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 3e3b285a4..0d3e707c5 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1217,7 +1217,7 @@ def unified_timestamp(date_str, day_first=True): return None date_str = re.sub(r'\s+', ' ', re.sub( - r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)) + r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?|sun)(day)?', '', date_str)) pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 timezone, date_str = extract_timezone(date_str) From ef36d517f9b05785d61abca7691d9ab7d63cc75c Mon Sep 17 00:00:00 2001 From: middlingphys <38708390+middlingphys@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:54:59 +0900 Subject: [PATCH 67/95] [ie/abematv] Fix availability extraction (#10569) Authored by: middlingphys --- yt_dlp/extractor/abematv.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 2611c6fdd..66ab083fe 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -377,8 +377,7 @@ def _real_extract(self, url): f'https://api.abema.io/v1/video/programs/{video_id}', video_id, note='Checking playability', headers=headers) - ondemand_types = traverse_obj(api_response, ('terms', ..., 'onDemandType')) - if 3 not in ondemand_types: + if not traverse_obj(api_response, ('label', 'free', {bool})): # cannot acquire decryption key for these streams self.report_warning('This is a premium-only stream') availability = 'premium_only' From 2b6df93a243bdfb9d6bb5c1e18020625cd02d465 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:55:06 -0500 Subject: [PATCH 68/95] [ie/vimeo:review] Fix password-protected video extraction (#10598) Closes #10255 Authored by: bashonly --- yt_dlp/extractor/vimeo.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index d10689cd8..a20cf4b17 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -1267,7 +1267,7 @@ class VimeoGroupsIE(VimeoChannelIE): # XXX: Do not subclass from concrete IE class VimeoReviewIE(VimeoBaseInfoExtractor): IE_NAME = 'vimeo:review' IE_DESC = 'Review pages on vimeo' - _VALID_URL = r'(?Phttps://vimeo\.com/[^/]+/review/(?P[^/]+)/[0-9a-f]{10})' + _VALID_URL = r'https?://vimeo\.com/(?P[^/?#]+)/review/(?P\d+)/(?P[\da-f]{10})' _TESTS = [{ 'url': 'https://vimeo.com/user21297594/review/75524534/3c257a1b5d', 'md5': 'c507a72f780cacc12b2248bb4006d253', @@ -1313,26 +1313,22 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): }] def _real_extract(self, url): - page_url, video_id = self._match_valid_url(url).groups() - data = self._download_json( - page_url.replace('/review/', '/review/data/'), video_id) + user, video_id, review_hash = self._match_valid_url(url).group('user', 'id', 'hash') + data_url = f'https://vimeo.com/{user}/review/data/{video_id}/{review_hash}' + data = self._download_json(data_url, video_id) if data.get('isLocked') is True: video_password = self._get_video_password() viewer = self._download_json( 'https://vimeo.com/_rv/viewer', video_id) - webpage = self._verify_video_password(video_id, video_password, viewer['xsrft']) - clip_page_config = self._parse_json(self._search_regex( - r'window\.vimeo\.clip_page_config\s*=\s*({.+?});', - webpage, 'clip page config'), video_id) - config_url = clip_page_config['player']['config_url'] - clip_data = clip_page_config.get('clip') or {} - else: - clip_data = data['clipData'] - config_url = clip_data['configUrl'] + self._verify_video_password(video_id, video_password, viewer['xsrft']) + data = self._download_json(data_url, video_id) + clip_data = data['clipData'] + config_url = clip_data['configUrl'] config = self._download_json(config_url, video_id) info_dict = self._parse_config(config, video_id) source_format = self._extract_original_format( - page_url + '/action', video_id) + f'https://vimeo.com/{user}/review/{video_id}/{review_hash}/action', video_id, + unlisted_hash=traverse_obj(config_url, ({parse_qs}, 'h', -1))) if source_format: info_dict['formats'].append(source_format) info_dict['description'] = clean_html(clip_data.get('description')) From 94a1c5e642e468cebeb51f74c6c220434cb47d96 Mon Sep 17 00:00:00 2001 From: trainman261 Date: Mon, 29 Jul 2024 23:58:26 +0200 Subject: [PATCH 69/95] [ie/cbc.ca:player] Fix extractor (#10302) Closes #10170 Authored by: trainman261, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/cbc.py | 269 ++++++++++++++++++++++++++++++++-------- 1 file changed, 214 insertions(+), 55 deletions(-) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 1522b08e2..373c9d2c9 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -1,4 +1,5 @@ import base64 +import functools import json import re import time @@ -6,17 +7,24 @@ import xml.etree.ElementTree from .common import InfoExtractor +from ..networking import HEADRequest from ..utils import ( ExtractorError, + float_or_none, int_or_none, join_nonempty, js_to_json, + mimetype2ext, orderedSet, parse_iso8601, + replace_extension, smuggle_url, strip_or_none, traverse_obj, try_get, + update_url, + url_basename, + url_or_none, ) @@ -149,6 +157,7 @@ def _real_extract(self, url): class CBCPlayerIE(InfoExtractor): IE_NAME = 'cbc.ca:player' _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/(?:video/)?|i/caffeine/syndicate/\?mediaId=))(?P(?:\d\.)?\d+)' + _GEO_COUNTRIES = ['CA'] _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', 'md5': '64d25f841ddf4ddb28a235338af32e2c', @@ -172,21 +181,20 @@ class CBCPlayerIE(InfoExtractor): 'description': 'md5:dd3b692f0a139b0369943150bd1c46a9', 'timestamp': 1425704400, 'upload_date': '20150307', - 'uploader': 'CBCC-NEW', - 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', + 'thumbnail': 'https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg', 'chapters': [], 'duration': 494.811, - 'categories': ['AudioMobile/All in a Weekend Montreal'], - 'tags': 'count:8', + 'categories': ['All in a Weekend Montreal'], + 'tags': 'count:11', 'location': 'Quebec', 'series': 'All in a Weekend Montreal', 'season': 'Season 2015', 'season_number': 2015, 'media_type': 'Excerpt', + 'genres': ['Other'], }, }, { 'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2164402062', - 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', 'info_dict': { 'id': '2164402062', 'ext': 'mp4', @@ -194,107 +202,168 @@ class CBCPlayerIE(InfoExtractor): 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', 'timestamp': 1320410746, 'upload_date': '20111104', - 'uploader': 'CBCC-NEW', - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', + 'thumbnail': 'https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg', 'chapters': [], 'duration': 186.867, 'series': 'CBC News: Windsor at 6:00', - 'categories': ['News/Canada/Windsor'], + 'categories': ['Windsor'], 'location': 'Windsor', - 'tags': ['cancer'], - 'creators': ['Allison Johnson'], + 'tags': ['Cancer', 'News/Canada/Windsor', 'Windsor'], 'media_type': 'Excerpt', + 'genres': ['News'], }, + 'params': {'skip_download': 'm3u8'}, }, { # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ 'url': 'https://www.cbc.ca/player/play/1.2985700', 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', 'info_dict': { - 'id': '2657631896', + 'id': '1.2985700', 'ext': 'mp3', 'title': 'CBC Montreal is organizing its first ever community hackathon!', 'description': 'The modern technology we tend to depend on so heavily, is never without it\'s share of hiccups and headaches. Next weekend - CBC Montreal will be getting members of the public for its first Hackathon.', 'timestamp': 1425704400, 'upload_date': '20150307', - 'uploader': 'CBCC-NEW', - 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', + 'thumbnail': 'https://i.cbc.ca/ais/1.2985700,1717262248558/full/max/0/default.jpg', 'chapters': [], 'duration': 494.811, - 'categories': ['AudioMobile/All in a Weekend Montreal'], - 'tags': 'count:8', + 'categories': ['All in a Weekend Montreal'], + 'tags': 'count:11', 'location': 'Quebec', 'series': 'All in a Weekend Montreal', 'season': 'Season 2015', 'season_number': 2015, 'media_type': 'Excerpt', + 'genres': ['Other'], }, }, { 'url': 'https://www.cbc.ca/player/play/1.1711287', - 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', 'info_dict': { - 'id': '2164402062', + 'id': '1.1711287', 'ext': 'mp4', 'title': 'Cancer survivor four times over', 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', 'timestamp': 1320410746, 'upload_date': '20111104', - 'uploader': 'CBCC-NEW', - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', + 'thumbnail': 'https://i.cbc.ca/ais/1.1711287,1717139372111/full/max/0/default.jpg', 'chapters': [], 'duration': 186.867, 'series': 'CBC News: Windsor at 6:00', - 'categories': ['News/Canada/Windsor'], + 'categories': ['Windsor'], 'location': 'Windsor', - 'tags': ['cancer'], - 'creators': ['Allison Johnson'], + 'tags': ['Cancer', 'News/Canada/Windsor', 'Windsor'], 'media_type': 'Excerpt', + 'genres': ['News'], }, + 'params': {'skip_download': 'm3u8'}, }, { # Has subtitles # These broadcasts expire after ~1 month, can find new test URL here: # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast - 'url': 'https://www.cbc.ca/player/play/1.7159484', - 'md5': '6ed6cd0fc2ef568d2297ba68a763d455', + 'url': 'https://www.cbc.ca/player/play/video/9.6424403', + 'md5': '8025909eaffcf0adf59922904def9a5e', 'info_dict': { - 'id': '2324213316001', + 'id': '9.6424403', 'ext': 'mp4', - 'title': 'The National | School boards sue social media giants', - 'description': 'md5:4b4db69322fa32186c3ce426da07402c', - 'timestamp': 1711681200, - 'duration': 2743.400, - 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/607/559/thumbnail.jpeg', - 'uploader': 'CBCC-NEW', + 'title': 'The National | N.W.T. wildfire emergency', + 'description': 'md5:ada33d36d1df69347ed575905bfd496c', + 'timestamp': 1718589600, + 'duration': 2692.833, + 'subtitles': { + 'en-US': [{ + 'name': 'English Captions', + 'url': 'https://cbchls.akamaized.net/delivery/news-shows/2024/06/17/NAT_JUN16-00-55-00/NAT_JUN16_cc.vtt', + }], + }, + 'thumbnail': 'https://i.cbc.ca/ais/6272b5c6-5e78-4c05-915d-0e36672e33d1,1714756287822/full/max/0/default.jpg', 'chapters': 'count:5', - 'upload_date': '20240329', - 'categories': 'count:4', + 'upload_date': '20240617', + 'categories': ['News', 'The National', 'The National Latest Broadcasts'], 'series': 'The National - Full Show', - 'tags': 'count:1', - 'creators': ['News'], + 'tags': ['The National'], 'location': 'Canada', 'media_type': 'Full Program', + 'genres': ['News'], }, }, { 'url': 'https://www.cbc.ca/player/play/video/1.7194274', 'md5': '188b96cf6bdcb2540e178a6caa957128', 'info_dict': { - 'id': '2334524995812', + 'id': '1.7194274', 'ext': 'mp4', 'title': '#TheMoment a rare white spirit moose was spotted in Alberta', 'description': 'md5:18ae269a2d0265c5b0bbe4b2e1ac61a3', 'timestamp': 1714788791, 'duration': 77.678, 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/201/543/THE_MOMENT.jpg', - 'uploader': 'CBCC-NEW', - 'chapters': 'count:0', - 'upload_date': '20240504', + 'thumbnail': 'https://i.cbc.ca/ais/1.7194274,1717224990425/full/max/0/default.jpg', + 'chapters': [], 'categories': 'count:3', 'series': 'The National', - 'tags': 'count:15', - 'creators': ['encoder'], + 'tags': 'count:17', 'location': 'Canada', 'media_type': 'Excerpt', + 'upload_date': '20240504', + 'genres': ['News'], + }, + }, { + 'url': 'https://www.cbc.ca/player/play/video/9.6427282', + 'info_dict': { + 'id': '9.6427282', + 'ext': 'mp4', + 'title': 'Men\'s Soccer - Argentina vs Morocco', + 'description': 'Argentina faces Morocco on the football pitch at Saint Etienne Stadium.', + 'series': 'CBC Sports', + 'media_type': 'Event Coverage', + 'thumbnail': 'https://i.cbc.ca/ais/a4c5c0c2-99fa-4bd3-8061-5a63879c1b33,1718828053500/full/max/0/default.jpg', + 'timestamp': 1721825400.0, + 'upload_date': '20240724', + 'duration': 10568.0, + 'chapters': [], + 'genres': [], + 'tags': ['2024 Paris Olympic Games'], + 'categories': ['Olympics Summer Soccer', 'Summer Olympics Replays', 'Summer Olympics Soccer Replays'], + 'location': 'Canada', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.cbc.ca/player/play/video/9.6459530', + 'md5': '6c1bb76693ab321a2e99c347a1d5ecbc', + 'info_dict': { + 'id': '9.6459530', + 'ext': 'mp4', + 'title': 'Parts of Jasper incinerated as wildfire rages', + 'description': 'md5:6f1caa8d128ad3f629257ef5fecf0962', + 'series': 'The National', + 'media_type': 'Excerpt', + 'thumbnail': 'https://i.cbc.ca/ais/507c0086-31a2-494d-96e4-bffb1048d045,1721953984375/full/max/0/default.jpg', + 'timestamp': 1721964091.012, + 'upload_date': '20240726', + 'duration': 952.285, + 'chapters': [], + 'genres': [], + 'tags': 'count:23', + 'categories': ['News (FAST)', 'News', 'The National', 'TV News Shows', 'The National '], + }, + }, { + 'url': 'https://www.cbc.ca/player/play/video/9.6420651', + 'md5': '71a850c2c6ee5e912de169f5311bb533', + 'info_dict': { + 'id': '9.6420651', + 'ext': 'mp4', + 'title': 'Is it a breath of fresh air? Measuring air quality in Edmonton', + 'description': 'md5:3922b92cc8b69212d739bd9dd095b1c3', + 'series': 'CBC News Edmonton', + 'media_type': 'Excerpt', + 'thumbnail': 'https://i.cbc.ca/ais/73c4ab9c-7ad4-46ee-bb9b-020fdc01c745,1718214547576/full/max/0/default.jpg', + 'timestamp': 1718220065.768, + 'upload_date': '20240612', + 'duration': 286.086, + 'chapters': [], + 'genres': ['News'], + 'categories': ['News', 'Edmonton'], + 'tags': 'count:7', + 'location': 'Edmonton', }, }, { 'url': 'cbcplayer:1.7159484', @@ -307,23 +376,113 @@ class CBCPlayerIE(InfoExtractor): 'only_matching': True, }] + def _parse_param(self, asset_data, name): + return traverse_obj(asset_data, ('params', lambda _, v: v['name'] == name, 'value', {str}, any)) + def _real_extract(self, url): video_id = self._match_id(url) - if '.' in video_id: - webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id) - video_id = self._search_json( - r'window\.__INITIAL_STATE__\s*=', webpage, - 'initial state', video_id)['video']['currentClip']['mediaId'] + webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id) + data = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id)['video']['currentClip'] + assets = traverse_obj( + data, ('media', 'assets', lambda _, v: url_or_none(v['key']) and v['type'])) + + if not assets and (media_id := traverse_obj(data, ('mediaId', {str}))): + # XXX: Deprecated; CBC is migrating off of ThePlatform + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url( + f'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/{media_id}?mbr=true&formats=MPEG4,FLV,MP3', { + 'force_smil_url': True, + }), + 'id': media_id, + '_format_sort_fields': ('res', 'proto'), # Prioritize direct http formats over HLS + } + + is_live = traverse_obj(data, ('media', 'streamType', {str})) == 'Live' + formats, subtitles = [], {} + + for sub in traverse_obj(data, ('media', 'textTracks', lambda _, v: url_or_none(v['src']))): + subtitles.setdefault(sub.get('language') or 'und', []).append({ + 'url': sub['src'], + 'name': sub.get('label'), + }) + + for asset in assets: + asset_key = asset['key'] + asset_type = asset['type'] + if asset_type != 'medianet': + self.report_warning(f'Skipping unsupported asset type "{asset_type}": {asset_key}') + continue + asset_data = self._download_json(asset_key, video_id, f'Downloading {asset_type} JSON') + ext = mimetype2ext(self._parse_param(asset_data, 'contentType')) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + asset_data['url'], video_id, 'mp4', m3u8_id='hls', live=is_live) + formats.extend(fmts) + # Avoid slow/error-prone webvtt-over-m3u8 if direct https vtt is available + if not subtitles: + self._merge_subtitles(subs, target=subtitles) + if is_live or not fmts: + continue + # Check for direct https mp4 format + best_video_fmt = traverse_obj(fmts, ( + lambda _, v: v.get('vcodec') != 'none' and v['tbr'], all, + {functools.partial(sorted, key=lambda x: x['tbr'])}, -1, {dict})) or {} + base_url = self._search_regex( + r'(https?://[^?#]+?/)hdntl=', best_video_fmt.get('url'), 'base url', default=None) + if not base_url or '/live/' in base_url: + continue + mp4_url = base_url + replace_extension(url_basename(best_video_fmt['url']), 'mp4') + if self._request_webpage( + HEADRequest(mp4_url), video_id, 'Checking for https format', + errnote=False, fatal=False): + formats.append({ + **best_video_fmt, + 'url': mp4_url, + 'format_id': 'https-mp4', + 'protocol': 'https', + 'manifest_url': None, + 'acodec': None, + }) + else: + formats.append({ + 'url': asset_data['url'], + 'ext': ext, + 'vcodec': 'none' if self._parse_param(asset_data, 'mediaType') == 'audio' else None, + }) + + chapters = traverse_obj(data, ( + 'media', 'chapters', lambda _, v: float(v['startTime']) is not None, { + 'start_time': ('startTime', {functools.partial(float_or_none, scale=1000)}), + 'end_time': ('endTime', {functools.partial(float_or_none, scale=1000)}), + 'title': ('name', {str}), + })) + # Filter out pointless single chapters with start_time==0 and no end_time + if len(chapters) == 1 and not (chapters[0].get('start_time') or chapters[0].get('end_time')): + chapters = [] return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url( - f'http://link.theplatform.com/s/ExhSPC/media/guid/2655402169/{video_id}?mbr=true&formats=MPEG4,FLV,MP3', { - 'force_smil_url': True, - }), + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str.strip}), + 'thumbnail': ('image', 'url', {url_or_none}, {functools.partial(update_url, query=None)}), + 'timestamp': ('publishedAt', {functools.partial(float_or_none, scale=1000)}), + 'media_type': ('media', 'clipType', {str}), + 'series': ('showName', {str}), + 'season_number': ('media', 'season', {int_or_none}), + 'duration': ('media', 'duration', {float_or_none}, {lambda x: None if is_live else x}), + 'location': ('media', 'region', {str}), + 'tags': ('tags', ..., 'name', {str}), + 'genres': ('media', 'genre', all), + 'categories': ('categories', ..., 'name', {str}), + }), 'id': video_id, - '_format_sort_fields': ('res', 'proto'), # Prioritize direct http formats over HLS + 'formats': formats, + 'subtitles': subtitles, + 'chapters': chapters, + 'is_live': is_live, } From fe15d3178e242803ae7a934b90137f13598eba2e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 30 Jul 2024 04:09:55 -0500 Subject: [PATCH 70/95] [ie/learningonscreen] Add extractor (#10590) Authored by: bashonly, Grub4K Co-authored-by: Simon Sawicki --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/common.py | 8 +-- yt_dlp/extractor/learningonscreen.py | 78 ++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 3 deletions(-) create mode 100644 yt_dlp/extractor/learningonscreen.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index dcce7ffa7..f4bd76158 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -986,6 +986,7 @@ LcpIE, LcpPlayIE, ) +from .learningonscreen import LearningOnScreenIE from .lecture2go import Lecture2GoIE from .lecturio import ( LecturioCourseIE, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index f63bd7825..187f73e7b 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -3150,7 +3150,7 @@ def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None): }) return formats, subtitles - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None, _headers=None): def absolute_url(item_url): return urljoin(base_url, item_url) @@ -3174,11 +3174,11 @@ def _media_formats(src, cur_media_type, type_info=None): formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, - preference=preference, quality=quality, fatal=False) + preference=preference, quality=quality, fatal=False, headers=_headers) elif ext == 'mpd': is_plain_url = False formats = self._extract_mpd_formats( - full_url, video_id, mpd_id=mpd_id, fatal=False) + full_url, video_id, mpd_id=mpd_id, fatal=False, headers=_headers) else: is_plain_url = True formats = [{ @@ -3272,6 +3272,8 @@ def _media_formats(src, cur_media_type, type_info=None): }) for f in media_info['formats']: f.setdefault('http_headers', {})['Referer'] = base_url + if _headers: + f['http_headers'].update(_headers) if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries diff --git a/yt_dlp/extractor/learningonscreen.py b/yt_dlp/extractor/learningonscreen.py new file mode 100644 index 000000000..dcf83144c --- /dev/null +++ b/yt_dlp/extractor/learningonscreen.py @@ -0,0 +1,78 @@ +import functools +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_id, + join_nonempty, + parse_duration, + unified_timestamp, +) +from ..utils.traversal import traverse_obj + + +class LearningOnScreenIE(InfoExtractor): + _VALID_URL = r'https?://learningonscreen\.ac\.uk/ondemand/index\.php/prog/(?P\w+)' + _TESTS = [{ + 'url': 'https://learningonscreen.ac.uk/ondemand/index.php/prog/005D81B2?bcast=22757013', + 'info_dict': { + 'id': '005D81B2', + 'ext': 'mp4', + 'title': 'Planet Earth', + 'duration': 3600.0, + 'timestamp': 1164567600.0, + 'upload_date': '20061126', + 'thumbnail': 'https://stream.learningonscreen.ac.uk/trilt-cover-images/005D81B2-Planet-Earth-2006-11-26T190000Z-BBC4.jpg', + }, + }] + + def _real_initialize(self): + if not self._get_cookies('https://learningonscreen.ac.uk/').get('PHPSESSID-BOB-LIVE'): + self.raise_login_required( + 'Use --cookies for authentication. See ' + ' https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp ' + 'for how to manually pass cookies', method=None) + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + details = traverse_obj(webpage, ( + {functools.partial(get_element_html_by_id, 'programme-details')}, { + 'title': ({functools.partial(re.search, r'

([^<]+)

')}, 1, {clean_html}), + 'timestamp': ( + {functools.partial(get_element_by_class, 'broadcast-date')}, + {functools.partial(re.match, r'([^<]+)')}, 1, {unified_timestamp}), + 'duration': ( + {functools.partial(get_element_by_class, 'prog-running-time')}, + {clean_html}, {parse_duration}), + })) + + title = details.pop('title', None) or traverse_obj(webpage, ( + {functools.partial(get_element_html_by_id, 'add-to-existing-playlist')}, + {extract_attributes}, 'data-record-title', {clean_html})) + + entries = self._parse_html5_media_entries( + 'https://stream.learningonscreen.ac.uk', webpage, video_id, m3u8_id='hls', mpd_id='dash', + _headers={'Origin': 'https://learningonscreen.ac.uk', 'Referer': 'https://learningonscreen.ac.uk/'}) + if not entries: + raise ExtractorError('No video found') + + if len(entries) > 1: + duration = details.pop('duration', None) + for idx, entry in enumerate(entries, start=1): + entry.update(details) + entry['id'] = join_nonempty(video_id, idx) + entry['title'] = join_nonempty(title, idx) + return self.playlist_result(entries, video_id, title, duration=duration) + + return { + **entries[0], + **details, + 'id': video_id, + 'title': title, + } From 0e539617a41913c7da1edd74fb6543c10ad727b3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 30 Jul 2024 16:27:06 -0500 Subject: [PATCH 71/95] [ie/youtube] Player client maintenance (#10573) - Add clients: android_producer, android_testsuite, android_vr, tv, web_safari - Remove obsolete clients: android_embedded, ios_embedded, *_embedscreen Authored by: bashonly --- README.md | 6 +- yt_dlp/extractor/youtube.py | 231 ++++++++++++++++++++++-------------- 2 files changed, 142 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index 3ed8717f0..a35efffc4 100644 --- a/README.md +++ b/README.md @@ -1758,7 +1758,7 @@ # Replace all spaces and "_" in title and uploader with a `-` # EXTRACTOR ARGUMENTS -Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. E.g. `--extractor-args "youtube:player-client=android_embedded,web;formats=incomplete" --extractor-args "funimation:version=uncut"` +Some extractors accept additional arguments which can be passed using `--extractor-args KEY:ARGS`. `ARGS` is a `;` (semicolon) separated string of `ARG=VAL1,VAL2`. E.g. `--extractor-args "youtube:player-client=mediaconnect,web;formats=incomplete" --extractor-args "funimation:version=uncut"` Note: In CLI, `ARG` can use `-` instead of `_`; e.g. `youtube:player-client"` becomes `youtube:player_client"` @@ -1767,7 +1767,7 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mediaconnect`, `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. The `android` clients will always be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `web_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,web` is used, but `tv_embedded` and `_creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) @@ -1775,7 +1775,7 @@ #### youtube * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total * `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others -* `innertube_key`: Innertube API key to use for all API requests +* `innertube_key`: Innertube API key to use for all API requests. By default, no API key is used * `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning #### youtubetab (YouTube playlists, channels, feeds, etc.) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 7364e8a2e..1a3e286c6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -72,133 +72,169 @@ # any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20220801.00.00', + 'clientVersion': '2.20240726.00.00', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, + }, + # Safari UA returns pre-merged video+audio 144p/240p/360p/720p/1080p HLS formats + 'web_safari': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'WEB', + 'clientVersion': '2.20240726.00.00', + 'userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.5 Safari/605.1.15,gzip(gfe)', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1, }, 'web_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_EMBEDDED_PLAYER', - 'clientVersion': '1.20220731.00.00', + 'clientVersion': '1.20240723.01.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 56, }, 'web_music': { - 'INNERTUBE_API_KEY': 'AIzaSyC9XL3ZjWddXya6X74dJoCTL-WEYFDNX30', 'INNERTUBE_HOST': 'music.youtube.com', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_REMIX', - 'clientVersion': '1.20220727.01.00', + 'clientVersion': '1.20240724.00.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, }, 'web_creator': { - 'INNERTUBE_API_KEY': 'AIzaSyBUPetSUmoZL-OhlxA7wSac5XinrygCqMo', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_CREATOR', - 'clientVersion': '1.20220726.00.00', + 'clientVersion': '1.20240723.03.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, }, 'android': { - 'INNERTUBE_API_KEY': 'AIzaSyA8eiZmM1FaDVjRy-df2KTyQ_vz_yYM39w', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '19.09.37', + 'clientVersion': '19.29.37', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.youtube/19.09.37 (Linux; U; Android 11) gzip', + 'userAgent': 'com.google.android.youtube/19.29.37 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, 'REQUIRE_JS_PLAYER': False, }, - 'android_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyCjc_pVEDi4qsv5MtC2dMXzpIaDoRFLsxw', - 'INNERTUBE_CONTEXT': { - 'client': { - 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '19.09.37', - 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.youtube/19.09.37 (Linux; U; Android 11) gzip', - }, - }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, - 'REQUIRE_JS_PLAYER': False, - }, 'android_music': { - 'INNERTUBE_API_KEY': 'AIzaSyAOghZGza2MQSZkY_zfZ370N-PUdXEo8AI', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '6.42.52', + 'clientVersion': '7.11.50', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.apps.youtube.music/6.42.52 (Linux; U; Android 11) gzip', + 'userAgent': 'com.google.android.apps.youtube.music/7.11.50 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, 'REQUIRE_JS_PLAYER': False, }, 'android_creator': { - 'INNERTUBE_API_KEY': 'AIzaSyD_qjV8zaaUMehtLkrKFgVeSX_Iqbtyws8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_CREATOR', - 'clientVersion': '22.30.100', + 'clientVersion': '24.30.100', 'androidSdkVersion': 30, - 'userAgent': 'com.google.android.apps.youtube.creator/22.30.100 (Linux; U; Android 11) gzip', + 'userAgent': 'com.google.android.apps.youtube.creator/24.30.100 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, 'REQUIRE_JS_PLAYER': False, }, + # YouTube Kids videos aren't returned on this client for some reason + 'android_vr': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_VR', + 'clientVersion': '1.57.29', + 'deviceMake': 'Oculus', + 'deviceModel': 'Quest 3', + 'androidSdkVersion': 32, + 'userAgent': 'com.google.android.apps.youtube.vr.oculus/1.57.29 (Linux; U; Android 12L; eureka-user Build/SQ3A.220605.009.A1) gzip', + 'osName': 'Android', + 'osVersion': '12L', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 28, + 'REQUIRE_JS_PLAYER': False, + }, + 'android_testsuite': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_TESTSUITE', + 'clientVersion': '1.9', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.youtube/1.9 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 30, + 'REQUIRE_JS_PLAYER': False, + 'PLAYER_PARAMS': '2AMB', + }, + # This client only has legacy formats and storyboards + 'android_producer': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'ANDROID_PRODUCER', + 'clientVersion': '0.111.1', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.apps.youtube.producer/0.111.1 (Linux; U; Android 11) gzip', + 'osName': 'Android', + 'osVersion': '11', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 91, + 'REQUIRE_JS_PLAYER': False, + }, # iOS clients have HLS live streams. Setting device model to get 60fps formats. # See: https://github.com/TeamNewPipe/NewPipeExtractor/issues/680#issuecomment-1002724558 'ios': { - 'INNERTUBE_API_KEY': 'AIzaSyB-63vPrdThhKuerbB2N_l7Kwwcxj6yUAc', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '19.09.3', - 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', + 'clientVersion': '19.29.1', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'userAgent': 'com.google.ios.youtube/19.29.1 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X;)', + 'osName': 'iPhone', + 'osVersion': '17.5.1.21F90', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, 'REQUIRE_JS_PLAYER': False, }, - 'ios_embedded': { - 'INNERTUBE_CONTEXT': { - 'client': { - 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '19.09.3', - 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtube/19.09.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', - }, - }, - 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, - 'REQUIRE_JS_PLAYER': False, - }, 'ios_music': { - 'INNERTUBE_API_KEY': 'AIzaSyBAETezhkwP0ZWA02RsqT1zu78Fpt0bC_s', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '6.33.3', - 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.youtubemusic/6.33.3 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', + 'clientVersion': '7.08.2', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'userAgent': 'com.google.ios.youtubemusic/7.08.2 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X;)', + 'osName': 'iPhone', + 'osVersion': '17.5.1.21F90', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, @@ -208,9 +244,12 @@ 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_CREATOR', - 'clientVersion': '22.33.101', - 'deviceModel': 'iPhone14,3', - 'userAgent': 'com.google.ios.ytcreator/22.33.101 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)', + 'clientVersion': '24.30.100', + 'deviceMake': 'Apple', + 'deviceModel': 'iPhone16,2', + 'userAgent': 'com.google.ios.ytcreator/24.30.100 (iPhone16,2; U; CPU iOS 17_5_1 like Mac OS X;)', + 'osName': 'iPhone', + 'osVersion': '17.5.1.21F90', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, @@ -219,19 +258,26 @@ # mweb has 'ultralow' formats # See: https://github.com/yt-dlp/yt-dlp/pull/557 'mweb': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'MWEB', - 'clientVersion': '2.20220801.00.00', + 'clientVersion': '2.20240726.01.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2, }, + 'tv': { + 'INNERTUBE_CONTEXT': { + 'client': { + 'clientName': 'TVHTML5', + 'clientVersion': '7.20240724.13.00', + }, + }, + 'INNERTUBE_CONTEXT_CLIENT_NAME': 7, + }, # This client can access age restricted videos (unless the uploader has disabled the 'allow embedding' option) # See: https://github.com/zerodytrash/YouTube-Internal-Clients 'tv_embedded': { - 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'TVHTML5_SIMPLY_EMBEDDED_PLAYER', @@ -249,6 +295,7 @@ }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 95, + 'REQUIRE_JS_PLAYER': False, }, } @@ -262,7 +309,7 @@ def _split_innertube_client(client_name): def short_client_name(client_name): - main, *parts = _split_innertube_client(client_name)[0].replace('embedscreen', 'e_s').split('_') + main, *parts = _split_innertube_client(client_name)[0].split('_') return join_nonempty(main[:4], ''.join(x[0] for x in parts)).upper() @@ -274,23 +321,18 @@ def build_innertube_clients(): priority = qualities(BASE_CLIENTS[::-1]) for client, ytcfg in tuple(INNERTUBE_CLIENTS.items()): - ytcfg.setdefault('INNERTUBE_API_KEY', 'AIzaSyDCU8hByM-4DrUqRUYnGn-3llEO78bcxq8') ytcfg.setdefault('INNERTUBE_HOST', 'www.youtube.com') ytcfg.setdefault('REQUIRE_JS_PLAYER', True) + ytcfg.setdefault('PLAYER_PARAMS', None) ytcfg['INNERTUBE_CONTEXT']['client'].setdefault('hl', 'en') _, base_client, variant = _split_innertube_client(client) ytcfg['priority'] = 10 * priority(base_client) - if not variant: - INNERTUBE_CLIENTS[f'{client}_embedscreen'] = embedscreen = copy.deepcopy(ytcfg) - embedscreen['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED' - embedscreen['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY - embedscreen['priority'] -= 3 - elif variant == 'embedded': + if variant == 'embedded': ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY ytcfg['priority'] -= 2 - else: + elif variant: ytcfg['priority'] -= 3 @@ -566,9 +608,6 @@ def _select_api_hostname(self, req_api_hostname, default_client=None): return (self._configuration_arg('innertube_host', [''], ie_key=YoutubeIE.ie_key())[0] or req_api_hostname or self._get_innertube_host(default_client or 'web')) - def _extract_api_key(self, ytcfg=None, default_client='web'): - return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], str, default_client) - def _extract_context(self, ytcfg=None, default_client='web'): context = get_first( (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) @@ -614,13 +653,15 @@ def _call_api(self, ep, query, video_id, fatal=True, headers=None, real_headers.update({'content-type': 'application/json'}) if headers: real_headers.update(headers) - api_key = (self._configuration_arg('innertube_key', [''], ie_key=YoutubeIE.ie_key(), casesense=True)[0] - or api_key or self._extract_api_key(default_client=default_client)) return self._download_json( f'https://{self._select_api_hostname(api_hostname, default_client)}/youtubei/v1/{ep}', video_id=video_id, fatal=fatal, note=note, errnote=errnote, data=json.dumps(data).encode('utf8'), headers=real_headers, - query={'key': api_key, 'prettyPrint': 'false'}) + query=filter_dict({ + 'key': self._configuration_arg( + 'innertube_key', [api_key], ie_key=YoutubeIE.ie_key(), casesense=True)[0], + 'prettyPrint': 'false', + }, cndn=lambda _, v: v)) def extract_yt_initial_data(self, item_id, webpage, fatal=True): return self._search_json(self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', item_id, fatal=fatal) @@ -972,7 +1013,6 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers ep=ep, fatal=True, headers=headers, video_id=item_id, query=query, note=note, context=self._extract_context(ytcfg, default_client), - api_key=self._extract_api_key(ytcfg, default_client), api_hostname=api_hostname, default_client=default_client) except ExtractorError as e: if not isinstance(e.cause, network_exceptions): @@ -1295,6 +1335,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt') _POTOKEN_EXPERIMENTS = ('51217476', '51217102') + _BROKEN_CLIENTS = { + short_client_name(client): client + for client in ('android', 'android_creator', 'android_music') + } _GEO_BYPASS = False @@ -3661,9 +3705,10 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, 'videoId': video_id, } - pp_arg = self._configuration_arg('player_params', [None], casesense=True)[0] - if pp_arg: - yt_query['params'] = pp_arg + default_pp = traverse_obj( + INNERTUBE_CLIENTS, (_split_innertube_client(client)[0], 'PLAYER_PARAMS', {str})) + if player_params := self._configuration_arg('player_params', [default_pp], casesense=True)[0]: + yt_query['params'] = player_params yt_query.update(self._generate_player_context(sts)) return self._extract_response( @@ -3675,7 +3720,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, def _get_requested_clients(self, url, smuggled_data): requested_clients = [] - android_clients = [] + broken_clients = [] default = ['ios', 'web'] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), @@ -3687,18 +3732,21 @@ def _get_requested_clients(self, url, smuggled_data): requested_clients.extend(allowed_clients) elif client not in allowed_clients: self.report_warning(f'Skipping unsupported client {client}') - elif client.startswith('android'): - android_clients.append(client) + elif client in self._BROKEN_CLIENTS.values(): + broken_clients.append(client) else: requested_clients.append(client) - # Force deprioritization of broken Android clients for format de-duplication - requested_clients.extend(android_clients) + # Force deprioritization of _BROKEN_CLIENTS for format de-duplication + requested_clients.extend(broken_clients) if not requested_clients: requested_clients = default if smuggled_data.get('is_music_url') or self.is_music_url(url): - requested_clients.extend( - f'{client}_music' for client in requested_clients if f'{client}_music' in INNERTUBE_CLIENTS) + for requested_client in requested_clients: + _, base_client, variant = _split_innertube_client(requested_client) + music_client = f'{base_client}_music' + if variant != 'music' and music_client in INNERTUBE_CLIENTS: + requested_clients.append(music_client) return orderedSet(requested_clients) @@ -3793,13 +3841,12 @@ def append_client(*client_names): prs.append(pr) # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in - if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated: + if variant == 'tv_embedded' and self._is_unplayable(pr) and self.is_authenticated: append_client(f'{base_client}_creator') - elif self._is_agegated(pr): - if variant == 'tv_embedded': - append_client(f'{base_client}_embedded') - elif not variant: - append_client(f'tv_embedded.{base_client}', f'{base_client}_embedded') + elif variant != 'tv_embedded' and self._is_agegated(pr): + if self.is_authenticated: + append_client(f'{base_client}_creator') + append_client(f'tv_embedded.{base_client}') if skipped_clients: self.report_warning( @@ -3935,13 +3982,13 @@ def build_fragments(f): f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) client_name = fmt.get(STREAMING_DATA_CLIENT_NAME) - # Android client formats are broken due to integrity check enforcement + # _BROKEN_CLIENTS return videoplayback URLs that expire after 30 seconds # Ref: https://github.com/yt-dlp/yt-dlp/issues/9554 - is_broken = client_name and client_name.startswith(short_client_name('android')) + is_broken = client_name in self._BROKEN_CLIENTS if is_broken: self.report_warning( - f'{video_id}: Android client formats are broken and may yield HTTP Error 403. ' - 'They will be deprioritized', only_once=True) + f'{video_id}: {self._BROKEN_CLIENTS[client_name]} client formats are broken ' + 'and may yield HTTP Error 403. They will be deprioritized', only_once=True) name = fmt.get('qualityLabel') or quality.replace('audio_quality_', '') or '' fps = int_or_none(fmt.get('fps')) or 0 From 4b69e1b53ea21e631cd5dd68ff531e2f1671ec17 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 30 Jul 2024 18:17:05 -0500 Subject: [PATCH 72/95] [ie/mlbtv] Fix makeup game extraction (#10607) Closes #10606 Authored by: bashonly --- yt_dlp/extractor/mlb.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/mlb.py b/yt_dlp/extractor/mlb.py index 230c218e7..935bf8561 100644 --- a/yt_dlp/extractor/mlb.py +++ b/yt_dlp/extractor/mlb.py @@ -290,9 +290,18 @@ class MLBTVIE(InfoExtractor): 'release_date': '20220702', 'release_timestamp': 1656792300, }, - 'params': { - 'skip_download': True, + 'params': {'skip_download': 'm3u8'}, + }, { + # makeup game: has multiple dates, need to avoid games with 'rescheduleDate' + 'url': 'https://www.mlb.com/tv/g747039/vd22541c4-5a29-45f7-822b-635ec041cf5e', + 'info_dict': { + 'id': '747039', + 'ext': 'mp4', + 'title': '2024-07-29 - Toronto Blue Jays @ Baltimore Orioles', + 'release_date': '20240729', + 'release_timestamp': 1722280200, }, + 'params': {'skip_download': 'm3u8'}, }] _GRAPHQL_INIT_QUERY = '''\ mutation initSession($device: InitSessionInput!, $clientType: ClientType!, $experience: ExperienceTypeInput) { @@ -463,11 +472,14 @@ def _extract_formats_and_subtitles(self, broadcast, video_id): def _real_extract(self, url): video_id = self._match_id(url) - metadata = traverse_obj(self._download_json( + data = self._download_json( 'https://statsapi.mlb.com/api/v1/schedule', video_id, query={ 'gamePk': video_id, 'hydrate': 'broadcasts(all),statusFlags', - }), ('dates', ..., 'games', lambda _, v: str(v['gamePk']) == video_id and v['broadcasts'], any)) + }) + metadata = traverse_obj(data, ( + 'dates', ..., 'games', + lambda _, v: str(v['gamePk']) == video_id and not v.get('rescheduleDate'), any)) broadcasts = traverse_obj(metadata, ( 'broadcasts', lambda _, v: v['mediaId'] and v['mediaState']['mediaStateCode'] != 'MEDIA_OFF')) From 2f1ddfe12a2c174bc777264c5c8ffe7ca0922d94 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 30 Jul 2024 20:50:20 -0500 Subject: [PATCH 73/95] [ie/olympics] Fix extractor (#10604) Closes #10592 Authored by: bashonly --- yt_dlp/extractor/olympics.py | 106 ++++++++++++++++++++++++++++++----- 1 file changed, 93 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/olympics.py b/yt_dlp/extractor/olympics.py index becf052f6..a50c510cb 100644 --- a/yt_dlp/extractor/olympics.py +++ b/yt_dlp/extractor/olympics.py @@ -1,9 +1,17 @@ from .common import InfoExtractor -from ..utils import int_or_none, try_get +from ..networking.exceptions import HTTPError +from ..utils import ( + ExtractorError, + int_or_none, + parse_iso8601, + try_get, + url_or_none, +) +from ..utils.traversal import traverse_obj class OlympicsReplayIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?olympics\.com(?:/tokyo-2020)?/[a-z]{2}/(?:replay|video)/(?P[^/#&?]+)' + _VALID_URL = r'https?://(?:www\.)?olympics\.com/[a-z]{2}/(?:paris-2024/)?(?:replay|videos?|original-series/episode)/(?P[\w-]+)' _TESTS = [{ 'url': 'https://olympics.com/fr/video/men-s-109kg-group-a-weightlifting-tokyo-2020-replays', 'info_dict': { @@ -11,26 +19,98 @@ class OlympicsReplayIE(InfoExtractor): 'ext': 'mp4', 'title': '+109kg (H) Groupe A - Haltérophilie | Replay de Tokyo 2020', 'upload_date': '20210801', - 'timestamp': 1627783200, + 'timestamp': 1627797600, 'description': 'md5:c66af4a5bc7429dbcc43d15845ff03b3', - 'uploader': 'International Olympic Committee', + 'thumbnail': 'https://img.olympics.com/images/image/private/t_1-1_1280/primary/nua4o7zwyaznoaejpbk2', + 'duration': 7017.0, }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp', - 'only_matching': True, + 'url': 'https://olympics.com/en/original-series/episode/b-boys-and-b-girls-take-the-spotlight-breaking-life-road-to-paris-2024', + 'info_dict': { + 'id': '32633650-c5ee-4280-8b94-fb6defb6a9b5', + 'ext': 'mp4', + 'title': 'B-girl Nicka - Breaking Life, Road to Paris 2024 | Episode 1', + 'upload_date': '20240517', + 'timestamp': 1715948200, + 'description': 'md5:f63d728a41270ec628f6ac33ce471bb1', + 'thumbnail': 'https://img.olympics.com/images/image/private/t_1-1_1280/primary/a3j96l7j6so3vyfijby1', + 'duration': 1321.0, + }, + }, { + 'url': 'https://olympics.com/en/paris-2024/videos/men-s-preliminaries-gbr-esp-ned-rsa-hockey-olympic-games-paris-2024', + 'info_dict': { + 'id': '3d96db23-8eee-4b7c-8ef5-488a0361026c', + 'ext': 'mp4', + 'title': 'Men\'s Preliminaries GBR-ESP & NED-RSA | Hockey | Olympic Games Paris 2024', + 'upload_date': '20240727', + 'timestamp': 1722066600, + }, + 'skip': 'Geo-restricted to RU, BR, BT, NP, TM, BD, TL', + }, { + 'url': 'https://olympics.com/en/paris-2024/videos/dnp-suni-lee-i-have-goals-and-i-have-expectations-for-myself-but-i-also-am-trying-to-give-myself-grace', + 'info_dict': { + 'id': 'a42f37ab-8a74-41d0-a7d9-af27b7b02a90', + 'ext': 'mp4', + 'title': 'md5:c7cfbc9918636a98e66400a812e4d407', + 'upload_date': '20240729', + 'timestamp': 1722288600, + }, }] + _GEO_BYPASS = False + + def _extract_from_nextjs_data(self, webpage, video_id): + data = traverse_obj(self._search_nextjs_data(webpage, video_id, default={}), ( + 'props', 'pageProps', 'page', 'items', + lambda _, v: v['name'] == 'videoPlaylist', 'data', 'currentVideo', {dict}, any)) + if not data: + return None + + geo_countries = traverse_obj(data, ('countries', ..., {str})) + if traverse_obj(data, ('geoRestrictedVideo', {bool})): + self.raise_geo_restricted(countries=geo_countries) + + is_live = traverse_obj(data, ('streamingStatus', {str})) == 'LIVE' + m3u8_url = traverse_obj(data, ('videoUrl', {url_or_none})) or data['streamUrl'] + tokenized_url = m3u8_url if is_live else self._tokenize_url(m3u8_url, video_id) + + try: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + tokenized_url, video_id, 'mp4', m3u8_id='hls') + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and 'georestricted' in e.cause.msg: + self.raise_geo_restricted(countries=geo_countries) + raise + + return { + 'formats': formats, + 'subtitles': subtitles, + 'is_live': is_live, + **traverse_obj(data, { + 'id': ('videoID', {str}), + 'title': ('title', {str}), + 'timestamp': ('contentDate', {parse_iso8601}), + }), + } + + def _tokenize_url(self, url, video_id): + return self._download_json( + 'https://olympics.com/tokenGenerator', video_id, + 'Downloading tokenized m3u8 url', query={'url': url}) def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + if info := self._extract_from_nextjs_data(webpage, video_id): + return info + title = self._html_search_meta(('title', 'og:title', 'twitter:title'), webpage) - uuid = self._html_search_meta('episode_uid', webpage) + video_uuid = self._html_search_meta('episode_uid', webpage) m3u8_url = self._html_search_meta('video_url', webpage) - json_ld = self._search_json_ld(webpage, uuid) + json_ld = self._search_json_ld(webpage, video_uuid) thumbnails_list = json_ld.get('image') if not thumbnails_list: thumbnails_list = self._html_search_regex( @@ -48,12 +128,12 @@ def _real_extract(self, url): 'width': width, 'height': int_or_none(try_get(width, lambda x: x * height_a / width_a)), }) - m3u8_url = self._download_json( - f'https://olympics.com/tokenGenerator?url={m3u8_url}', uuid, note='Downloading m3u8 url') - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, 'mp4', m3u8_id='hls') + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + self._tokenize_url(m3u8_url, video_uuid), video_uuid, 'mp4', m3u8_id='hls') return { - 'id': uuid, + 'id': video_uuid, 'title': title, 'thumbnails': thumbnails, 'formats': formats, From 5260696b1cba77161828941fdb38f09f14ac6c60 Mon Sep 17 00:00:00 2001 From: vvto33 <54504675+vvto33@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:18:43 +0900 Subject: [PATCH 74/95] [ie/tver] Support olympic URLs (#10600) Closes #10583 Authored by: vvto33 --- yt_dlp/extractor/tver.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index 8105db41c..c13832c6f 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -10,7 +10,7 @@ class TVerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?Plp|corner|series|episodes?|feature|tokyo2020/video)/)+(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?Plp|corner|series|episodes?|feature|tokyo2020/video|olympic/paris2024/video)/)+(?P[a-zA-Z0-9]+)' _TESTS = [{ 'skip': 'videos are only available for 7 days', 'url': 'https://tver.jp/episodes/ep83nf3w4p', @@ -23,6 +23,20 @@ class TVerIE(InfoExtractor): 'channel': 'テレビ朝日', }, 'add_ie': ['BrightcoveNew'], + }, { + 'url': 'https://tver.jp/olympic/paris2024/video/6359578055112/', + 'info_dict': { + 'id': '6359578055112', + 'ext': 'mp4', + 'title': '堀米雄斗 金メダルで五輪連覇!「みんなの応援が最後に乗れたカギ」', + 'timestamp': 1722279928, + 'upload_date': '20240729', + 'tags': ['20240729', 'japanese', 'japanmedal', 'paris'], + 'uploader_id': '4774017240001', + 'thumbnail': r're:https?://[^/?#]+boltdns\.net/[^?#]+/1920x1080/match/image\.jpg', + 'duration': 670.571, + }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://tver.jp/corner/f0103888', 'only_matching': True, @@ -47,7 +61,15 @@ def _real_initialize(self): def _real_extract(self, url): video_id, video_type = self._match_valid_url(url).group('id', 'type') - if video_type not in {'series', 'episodes'}: + + if video_type == 'olympic/paris2024/video': + # Player ID is taken from .content.brightcove.E200.pro.pc.account_id: + # https://tver.jp/olympic/paris2024/req/api/hook?q=https%3A%2F%2Folympic-assets.tver.jp%2Fweb-static%2Fjson%2Fconfig.json&d= + return self.url_result(smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % ('4774017240001', video_id), + {'geo_countries': ['JP']}), 'BrightcoveNew') + + elif video_type not in {'series', 'episodes'}: webpage = self._download_webpage(url, video_id, note='Resolving to new URL') video_id = self._match_id(self._search_regex( (r'canonical"\s*href="(https?://tver\.jp/[^"]+)"', r'&link=(https?://tver\.jp/[^?&]+)[?&]'), From 7e3e4779ad13e4511c9ba3869879e53f0267bd7a Mon Sep 17 00:00:00 2001 From: szantnerb <2652078+szantnerb@users.noreply.github.com> Date: Wed, 31 Jul 2024 04:22:44 +0200 Subject: [PATCH 75/95] [ie/mediaklikk] Fix extractor (#10605) Closes #10588 Authored by: szantnerb --- yt_dlp/extractor/mediaklikk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py index bd1a27fcc..f51342060 100644 --- a/yt_dlp/extractor/mediaklikk.py +++ b/yt_dlp/extractor/mediaklikk.py @@ -133,7 +133,9 @@ def _real_extract(self, url): r']+\bclass="article_date">([^<]+)<', webpage, 'upload date', default=None)) player_data['video'] = player_data.pop('token') - player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data) + player_page = self._download_webpage( + 'https://player.mediaklikk.hu/playernew/player.php', video_id, + query=player_data, headers={'Referer': url}) player_json = self._search_json( r'\bpl\.setup\s*\(', player_page, 'player json', video_id, end_pattern=r'\);') playlist_url = traverse_obj( From 011b4a04db2a636c3ef0a0ad4e2d3ae482c9fd76 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 31 Jul 2024 16:19:30 -0500 Subject: [PATCH 76/95] [ie/youtube] Fix `n` function name extraction for player `20dfca59` (#10611) Closes #10608 Authored by: bashonly --- test/test_youtube_signature.py | 4 ++++ yt_dlp/extractor/youtube.py | 26 ++++++++++++++++++++------ 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index ae167d16d..d37df7a2e 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -175,6 +175,10 @@ 'https://www.youtube.com/s/player/3400486c/player_ias.vflset/en_US/base.js', 'lL46g3XifCKUZn1Xfw', 'z767lhet6V2Skl', ), + ( + 'https://www.youtube.com/s/player/20dfca59/player_ias.vflset/en_US/base.js', + '-fLCxedkAk4LUTK2', 'O8kfRq1y1eyHGw', + ), ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 1a3e286c6..4993ce397 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3174,18 +3174,32 @@ def _decrypt_nsig(self, s, video_id, player_url): return ret def _extract_n_function_name(self, jscode): + # Examples (with placeholders nfunc, narray, idx): + # * .get("n"))&&(b=nfunc(b) + # * .get("n"))&&(b=narray[idx](b) + # * b=String.fromCharCode(110),c=a.get(b))&&c=narray[idx](c) + # * a.D&&(b="nn"[+a.D],c=a.get(b))&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") + # * a.D&&(PL(a),b=a.j.n||null)&&(b=narray[0](b),a.set("n",b),narray.length||nfunc("") funcname, idx = self._search_regex( r'''(?x) (?: \.get\("n"\)\)&&\(b=| (?: b=String\.fromCharCode\(110\)| - ([a-zA-Z0-9$.]+)&&\(b="nn"\[\+\1\] - ),c=a\.get\(b\)\)&&\(c= - ) - (?P[a-zA-Z0-9$]+)(?:\[(?P\d+)\])?\([a-zA-Z0-9]\)''', - jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) - if not idx: + (?P[a-zA-Z0-9_$.]+)&&\(b="nn"\[\+(?P=str_idx)\] + ),c=a\.get\(b\)\)&&\(c=| + \b(?P[a-zA-Z0-9_$]+)= + )(?P[a-zA-Z0-9_$]+)(?:\[(?P\d+)\])?\([a-zA-Z]\) + (?(var),[a-zA-Z0-9_$]+\.set\("n"\,(?P=var)\),(?P=nfunc)\.length)''', + jscode, 'n function name', group=('nfunc', 'idx'), default=(None, None)) + if not funcname: + self.report_warning('Falling back to generic n function search') + return self._search_regex( + r'''(?xs) + ;\s*(?P[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) + \s*\{(?:(?!};).)+?["']enhanced_except_''', + jscode, 'Initial JS player n function name', group='name') + elif not idx: return funcname return json.loads(js_to_json(self._search_regex( From d19fcb934269465fd707e68a87f735ec6983e93d Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 31 Jul 2024 16:39:36 -0500 Subject: [PATCH 77/95] [ie/youtube] Fix age-verification workaround (#10610) Authored by: bashonly, Grub4K Co-authored-by: Simon Sawicki --- README.md | 2 +- yt_dlp/extractor/youtube.py | 26 ++++++++++++++++++++------ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index a35efffc4..ea3fad1c1 100644 --- a/README.md +++ b/README.md @@ -1767,7 +1767,7 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `web_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,web` is used, but `tv_embedded` and `_creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,web` is used, but `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 4993ce397..b20dfda41 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3854,14 +3854,28 @@ def append_client(*client_names): f[STREAMING_DATA_CLIENT_NAME] = name prs.append(pr) - # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in - if variant == 'tv_embedded' and self._is_unplayable(pr) and self.is_authenticated: - append_client(f'{base_client}_creator') - elif variant != 'tv_embedded' and self._is_agegated(pr): - if self.is_authenticated: - append_client(f'{base_client}_creator') + # tv_embedded can work around age-gate and age-verification IF the video is embeddable + if self._is_agegated(pr) and variant != 'tv_embedded': append_client(f'tv_embedded.{base_client}') + # Unauthenticated users will only get tv_embedded client formats if age-gated + if self._is_agegated(pr) and not self.is_authenticated: + self.to_screen( + f'{video_id}: This video is age-restricted; some formats may be missing ' + f'without authentication. {self._login_hint()}', only_once=True) + + # EU countries require age-verification for accounts to access age-restricted videos + # If account is not age-verified, _is_agegated() will be truthy for non-embedded clients + # If embedding is disabled for the video, _is_unplayable() will be truthy for tv_embedded + embedding_is_disabled = variant == 'tv_embedded' and self._is_unplayable(pr) + if self.is_authenticated and (self._is_agegated(pr) or embedding_is_disabled): + self.to_screen( + f'{video_id}: This video is age-restricted and YouTube is requiring ' + 'account age-verification; some formats may be missing', only_once=True) + # web_creator and mediaconnect can work around the age-verification requirement + # _producer, _testsuite, & _vr variants can also work around age-verification + append_client('web_creator', 'mediaconnect') + if skipped_clients: self.report_warning( f'Skipping player responses from {"/".join(skipped_clients)} clients ' From bb3936ae2b3ce96d0b53f9e17cad1082058f032b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?O=C4=9Fulcan=20Tokar?= <42005993+luvyana@users.noreply.github.com> Date: Thu, 1 Aug 2024 03:00:52 +0300 Subject: [PATCH 78/95] [ie/kick:clips] Add extractor (#10572) Closes #8115 Authored by: luvyana --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/kick.py | 205 +++++++++++++++++++++++--------- 2 files changed, 150 insertions(+), 56 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f4bd76158..9b73fcd75 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -939,6 +939,7 @@ KhanAcademyUnitIE, ) from .kick import ( + KickClipIE, KickIE, KickVODIE, ) diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index 889548f52..1c1b2a177 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -1,9 +1,14 @@ +import functools + from .common import InfoExtractor from ..networking import HEADRequest from ..utils import ( UserNotLive, + determine_ext, float_or_none, + int_or_none, merge_dicts, + parse_iso8601, str_or_none, traverse_obj, unified_timestamp, @@ -25,104 +30,192 @@ def _real_initialize(self): def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs): return self._download_json( - f'https://kick.com/api/v1/{path}', display_id, note=note, + f'https://kick.com/api/{path}', display_id, note=note, headers=merge_dicts(headers, self._API_HEADERS), impersonate=True, **kwargs) class KickIE(KickBaseIE): + IE_NAME = 'kick:live' _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P[\w-]+)' _TESTS = [{ - 'url': 'https://kick.com/yuppy', + 'url': 'https://kick.com/buddha', 'info_dict': { - 'id': '6cde1-kickrp-joe-flemmingskick-info-heremust-knowmust-see21', + 'id': '92722911-nopixel-40', 'ext': 'mp4', 'title': str, 'description': str, - 'channel': 'yuppy', - 'channel_id': '33538', - 'uploader': 'Yuppy', - 'uploader_id': '33793', - 'upload_date': str, - 'live_status': 'is_live', 'timestamp': int, - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': r're:https?://.+\.jpg', 'categories': list, + 'upload_date': str, + 'channel': 'buddha', + 'channel_id': '32807', + 'uploader': 'Buddha', + 'uploader_id': '33057', + 'live_status': 'is_live', + 'concurrent_view_count': int, + 'release_timestamp': int, + 'age_limit': 18, + 'release_date': str, }, - 'skip': 'livestream', + 'params': {'skip_download': 'livestream'}, + # 'skip': 'livestream', }, { - 'url': 'https://kick.com/kmack710', + 'url': 'https://kick.com/xqc', 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if KickClipIE.suitable(url) else super().suitable(url) + def _real_extract(self, url): channel = self._match_id(url) - response = self._call_api(f'channels/{channel}', channel) + response = self._call_api(f'v2/channels/{channel}', channel) if not traverse_obj(response, 'livestream', expected_type=dict): raise UserNotLive(video_id=channel) return { - 'id': str(traverse_obj( - response, ('livestream', ('slug', 'id')), get_all=False, default=channel)), - 'formats': self._extract_m3u8_formats( - response['playback_url'], channel, 'mp4', live=True), - 'title': traverse_obj( - response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), - 'description': traverse_obj(response, ('user', 'bio')), 'channel': channel, - 'channel_id': str_or_none(traverse_obj(response, 'id', ('livestream', 'channel_id'))), - 'uploader': traverse_obj(response, 'name', ('user', 'username')), - 'uploader_id': str_or_none(traverse_obj(response, 'user_id', ('user', 'id'))), 'is_live': True, - 'timestamp': unified_timestamp(traverse_obj(response, ('livestream', 'created_at'))), - 'thumbnail': traverse_obj( - response, ('livestream', 'thumbnail', 'url'), expected_type=url_or_none), - 'categories': traverse_obj(response, ('recent_categories', ..., 'name')), + 'formats': self._extract_m3u8_formats(response['playback_url'], channel, 'mp4', live=True), + **traverse_obj(response, { + 'id': ('livestream', 'slug', {str}), + 'title': ('livestream', 'session_title', {str}), + 'description': ('user', 'bio', {str}), + 'channel_id': (('id', ('livestream', 'channel_id')), {int}, {str_or_none}, any), + 'uploader': (('name', ('user', 'username')), {str}, any), + 'uploader_id': (('user_id', ('user', 'id')), {int}, {str_or_none}, any), + 'timestamp': ('livestream', 'created_at', {unified_timestamp}), + 'release_timestamp': ('livestream', 'start_time', {unified_timestamp}), + 'thumbnail': ('livestream', 'thumbnail', 'url', {url_or_none}), + 'categories': ('recent_categories', ..., 'name', {str}), + 'concurrent_view_count': ('livestream', 'viewer_count', {int_or_none}), + 'age_limit': ('livestream', 'is_mature', {bool}, {lambda x: 18 if x else 0}), + }), } class KickVODIE(KickBaseIE): + IE_NAME = 'kick:vod' _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' _TESTS = [{ - 'url': 'https://kick.com/video/58bac65b-e641-4476-a7ba-3707a35e60e3', + 'url': 'https://kick.com/video/e74614f4-5270-4319-90ad-32179f19a45c', 'md5': '3870f94153e40e7121a6e46c068b70cb', 'info_dict': { - 'id': '58bac65b-e641-4476-a7ba-3707a35e60e3', + 'id': 'e74614f4-5270-4319-90ad-32179f19a45c', 'ext': 'mp4', - 'title': '🤠REBIRTH IS BACK!!!!🤠!stake CODE JAREDFPS 🤠', - 'description': 'md5:02b0c46f9b4197fb545ab09dddb85b1d', - 'channel': 'jaredfps', - 'channel_id': '26608', - 'uploader': 'JaredFPS', - 'uploader_id': '26799', - 'upload_date': '20240402', - 'timestamp': 1712097108, - 'duration': 33859.0, + 'title': r're:❎ MEGA DRAMA ❎ LIVE ❎ CLICK ❎ ULTIMATE SKILLS .+', + 'description': 'THE BEST AT ABSOLUTELY EVERYTHING. THE JUICER. LEADER OF THE JUICERS.', + 'channel': 'xqc', + 'channel_id': '668', + 'uploader': 'xQc', + 'uploader_id': '676', + 'upload_date': '20240724', + 'timestamp': 1721796562, + 'duration': 18566.0, 'thumbnail': r're:^https?://.*\.jpg', - 'categories': ['Call of Duty: Warzone'], + 'view_count': int, + 'categories': ['VALORANT'], + 'age_limit': 0, }, - 'params': { - 'skip_download': 'm3u8', - }, - 'expected_warnings': [r'impersonation'], + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): video_id = self._match_id(url) - response = self._call_api(f'video/{video_id}', video_id) + response = self._call_api(f'v1/video/{video_id}', video_id) return { 'id': video_id, 'formats': self._extract_m3u8_formats(response['source'], video_id, 'mp4'), - 'title': traverse_obj( - response, ('livestream', ('session_title', 'slug')), get_all=False, default=''), - 'description': traverse_obj(response, ('livestream', 'channel', 'user', 'bio')), - 'channel': traverse_obj(response, ('livestream', 'channel', 'slug')), - 'channel_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'id'))), - 'uploader': traverse_obj(response, ('livestream', 'channel', 'user', 'username')), - 'uploader_id': str_or_none(traverse_obj(response, ('livestream', 'channel', 'user_id'))), - 'timestamp': unified_timestamp(response.get('created_at')), - 'duration': float_or_none(traverse_obj(response, ('livestream', 'duration')), scale=1000), - 'thumbnail': traverse_obj( - response, ('livestream', 'thumbnail'), expected_type=url_or_none), - 'categories': traverse_obj(response, ('livestream', 'categories', ..., 'name')), + **traverse_obj(response, { + 'title': ('livestream', ('session_title', 'slug'), {str}, any), + 'description': ('livestream', 'channel', 'user', 'bio', {str}), + 'channel': ('livestream', 'channel', 'slug', {str}), + 'channel_id': ('livestream', 'channel', 'id', {int}, {str_or_none}), + 'uploader': ('livestream', 'channel', 'user', 'username', {str}), + 'uploader_id': ('livestream', 'channel', 'user_id', {int}, {str_or_none}), + 'timestamp': ('created_at', {parse_iso8601}), + 'duration': ('livestream', 'duration', {functools.partial(float_or_none, scale=1000)}), + 'thumbnail': ('livestream', 'thumbnail', {url_or_none}), + 'categories': ('livestream', 'categories', ..., 'name', {str}), + 'view_count': ('views', {int_or_none}), + 'age_limit': ('livestream', 'is_mature', {bool}, {lambda x: 18 if x else 0}), + }), + } + + +class KickClipIE(KickBaseIE): + IE_NAME = 'kick:clips' + _VALID_URL = r'https?://(?:www\.)?kick\.com/[\w-]+/?\?(?:[^#]+&)?clip=(?Pclip_[\w-]+)' + _TESTS = [{ + 'url': 'https://kick.com/mxddy?clip=clip_01GYXVB5Y8PWAPWCWMSBCFB05X', + 'info_dict': { + 'id': 'clip_01GYXVB5Y8PWAPWCWMSBCFB05X', + 'ext': 'mp4', + 'title': 'Maddy detains Abd D:', + 'channel': 'mxddy', + 'channel_id': '133789', + 'uploader': 'AbdCreates', + 'uploader_id': '3309077', + 'thumbnail': r're:^https?://.*\.jpeg', + 'duration': 35, + 'timestamp': 1682481453, + 'upload_date': '20230426', + 'view_count': int, + 'like_count': int, + 'categories': ['VALORANT'], + 'age_limit': 18, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://kick.com/destiny?clip=clip_01H9SKET879NE7N9RJRRDS98J3', + 'info_dict': { + 'id': 'clip_01H9SKET879NE7N9RJRRDS98J3', + 'title': 'W jews', + 'ext': 'mp4', + 'channel': 'destiny', + 'channel_id': '1772249', + 'uploader': 'punished_furry', + 'uploader_id': '2027722', + 'duration': 49.0, + 'upload_date': '20230908', + 'timestamp': 1694150180, + 'thumbnail': 'https://clips.kick.com/clips/j3/clip_01H9SKET879NE7N9RJRRDS98J3/thumbnail.png', + 'view_count': int, + 'like_count': int, + 'categories': ['Just Chatting'], + 'age_limit': 0, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + clip_id = self._match_id(url) + clip = self._call_api(f'v2/clips/{clip_id}/play', clip_id)['clip'] + clip_url = clip['clip_url'] + + if determine_ext(clip_url) == 'm3u8': + formats = self._extract_m3u8_formats(clip_url, clip_id, 'mp4') + else: + formats = [{'url': clip_url}] + + return { + 'id': clip_id, + 'formats': formats, + **traverse_obj(clip, { + 'title': ('title', {str}), + 'channel': ('channel', 'slug', {str}), + 'channel_id': ('channel', 'id', {int}, {str_or_none}), + 'uploader': ('creator', 'username', {str}), + 'uploader_id': ('creator', 'id', {int}, {str_or_none}), + 'thumbnail': ('thumbnail_url', {url_or_none}), + 'duration': ('duration', {float_or_none}), + 'categories': ('category', 'name', {str}, all), + 'timestamp': ('created_at', {parse_iso8601}), + 'view_count': ('views', {int_or_none}), + 'like_count': ('likes', {int_or_none}), + 'age_limit': ('is_mature', {bool}, {lambda x: 18 if x else 0}), + }), } From efb42763dec23ccf6a2e3bac3afbfefce8efd012 Mon Sep 17 00:00:00 2001 From: sepro Date: Thu, 1 Aug 2024 16:03:03 +0200 Subject: [PATCH 79/95] [ie/youtube] Change default player clients to `ios,tv` (#10457) Closes #10046 Authored by: seproDev --- README.md | 2 +- yt_dlp/extractor/youtube.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ea3fad1c1..dd78012a8 100644 --- a/README.md +++ b/README.md @@ -1767,7 +1767,7 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,web` is used, but `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,tv` is used, but `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index b20dfda41..c56358288 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3735,7 +3735,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, def _get_requested_clients(self, url, smuggled_data): requested_clients = [] broken_clients = [] - default = ['ios', 'web'] + default = ['ios', 'tv'] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) From ffd7781d6588926f820b44a34b9e6e3068fb9f97 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 1 Aug 2024 10:03:49 -0500 Subject: [PATCH 80/95] [cleanup] Misc (#10623) Authored by: bashonly --- yt_dlp/extractor/youtube.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index c56358288..88e1a28ae 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3173,7 +3173,7 @@ def _decrypt_nsig(self, s, video_id, player_url): self.write_debug(f'Decrypted nsig {s} => {ret}') return ret - def _extract_n_function_name(self, jscode): + def _extract_n_function_name(self, jscode, player_url=None): # Examples (with placeholders nfunc, narray, idx): # * .get("n"))&&(b=nfunc(b) # * .get("n"))&&(b=narray[idx](b) @@ -3193,7 +3193,9 @@ def _extract_n_function_name(self, jscode): (?(var),[a-zA-Z0-9_$]+\.set\("n"\,(?P=var)\),(?P=nfunc)\.length)''', jscode, 'n function name', group=('nfunc', 'idx'), default=(None, None)) if not funcname: - self.report_warning('Falling back to generic n function search') + self.report_warning(join_nonempty( + 'Falling back to generic n function search', + player_url and f' player = {player_url}', delim='\n')) return self._search_regex( r'''(?xs) ;\s*(?P[a-zA-Z0-9_$]+)\s*=\s*function\([a-zA-Z0-9_$]+\) @@ -3215,7 +3217,7 @@ def _extract_n_function_code(self, video_id, player_url): if func_code: return jsi, player_id, func_code - func_name = self._extract_n_function_name(jscode) + func_name = self._extract_n_function_name(jscode, player_url=player_url) func_code = jsi.extract_function_code(func_name) From abe10131fc235b7cc7af39f833e417f4264c1fdb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 1 Aug 2024 15:11:19 +0000 Subject: [PATCH 81/95] Release 2024.08.01 Created by: bashonly :ci skip all :ci run dl --- CONTRIBUTORS | 2 ++ Changelog.md | 26 ++++++++++++++++++++++++++ supportedsites.md | 9 +++++---- yt_dlp/version.py | 6 +++--- 4 files changed, 36 insertions(+), 7 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 01c07aab9..2180ecfe2 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -653,3 +653,5 @@ LeSuisse DunnesH iancmy mokrueger +luvyana +szantnerb diff --git a/Changelog.md b/Changelog.md index b2cad7dc4..73bf828a6 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,32 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.08.01 + +#### Core changes +- **utils**: `unified_timestamp`: [Recognize Sunday](https://github.com/yt-dlp/yt-dlp/commit/6daf2c27c0464fba98337be30de0b66d520d0db1) ([#10589](https://github.com/yt-dlp/yt-dlp/issues/10589)) by [bashonly](https://github.com/bashonly) + +#### Extractor changes +- **abematv**: [Fix availability extraction](https://github.com/yt-dlp/yt-dlp/commit/ef36d517f9b05785d61abca7691d9ab7d63cc75c) ([#10569](https://github.com/yt-dlp/yt-dlp/issues/10569)) by [middlingphys](https://github.com/middlingphys) +- **cbc.ca**: player: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/94a1c5e642e468cebeb51f74c6c220434cb47d96) ([#10302](https://github.com/yt-dlp/yt-dlp/issues/10302)) by [bashonly](https://github.com/bashonly), [trainman261](https://github.com/trainman261) +- **discoveryplus**: [Support olympics URLs](https://github.com/yt-dlp/yt-dlp/commit/0b7728618417e1aa382722a4d29b916b594d4459) ([#10566](https://github.com/yt-dlp/yt-dlp/issues/10566)) by [bashonly](https://github.com/bashonly) +- **kick**: clips: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/bb3936ae2b3ce96d0b53f9e17cad1082058f032b) ([#10572](https://github.com/yt-dlp/yt-dlp/issues/10572)) by [luvyana](https://github.com/luvyana) +- **learningonscreen**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/fe15d3178e242803ae7a934b90137f13598eba2e) ([#10590](https://github.com/yt-dlp/yt-dlp/issues/10590)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) +- **mediaklikk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7e3e4779ad13e4511c9ba3869879e53f0267bd7a) ([#10605](https://github.com/yt-dlp/yt-dlp/issues/10605)) by [szantnerb](https://github.com/szantnerb) +- **mlbtv**: [Fix makeup game extraction](https://github.com/yt-dlp/yt-dlp/commit/4b69e1b53ea21e631cd5dd68ff531e2f1671ec17) ([#10607](https://github.com/yt-dlp/yt-dlp/issues/10607)) by [bashonly](https://github.com/bashonly) +- **olympics**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/2f1ddfe12a2c174bc777264c5c8ffe7ca0922d94) ([#10604](https://github.com/yt-dlp/yt-dlp/issues/10604)) by [bashonly](https://github.com/bashonly) +- **tva**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/28d485714fef88937c82635438afba5db81f9089) ([#10567](https://github.com/yt-dlp/yt-dlp/issues/10567)) by [bashonly](https://github.com/bashonly) +- **tver**: [Support olympic URLs](https://github.com/yt-dlp/yt-dlp/commit/5260696b1cba77161828941fdb38f09f14ac6c60) ([#10600](https://github.com/yt-dlp/yt-dlp/issues/10600)) by [vvto33](https://github.com/vvto33) +- **vimeo**: review: [Fix password-protected video extraction](https://github.com/yt-dlp/yt-dlp/commit/2b6df93a243bdfb9d6bb5c1e18020625cd02d465) ([#10598](https://github.com/yt-dlp/yt-dlp/issues/10598)) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Change default player clients to `ios,tv`](https://github.com/yt-dlp/yt-dlp/commit/efb42763dec23ccf6a2e3bac3afbfefce8efd012) ([#10457](https://github.com/yt-dlp/yt-dlp/issues/10457)) by [seproDev](https://github.com/seproDev) + - [Fix `n` function name extraction for player `20dfca59`](https://github.com/yt-dlp/yt-dlp/commit/011b4a04db2a636c3ef0a0ad4e2d3ae482c9fd76) ([#10611](https://github.com/yt-dlp/yt-dlp/issues/10611)) by [bashonly](https://github.com/bashonly) + - [Fix age-verification workaround](https://github.com/yt-dlp/yt-dlp/commit/d19fcb934269465fd707e68a87f735ec6983e93d) ([#10610](https://github.com/yt-dlp/yt-dlp/issues/10610)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Player client maintenance](https://github.com/yt-dlp/yt-dlp/commit/0e539617a41913c7da1edd74fb6543c10ad727b3) ([#10573](https://github.com/yt-dlp/yt-dlp/issues/10573)) by [bashonly](https://github.com/bashonly) + +#### Misc. changes +- **cleanup**: Miscellaneous: [ffd7781](https://github.com/yt-dlp/yt-dlp/commit/ffd7781d6588926f820b44a34b9e6e3068fb9f97) by [bashonly](https://github.com/bashonly) + ### 2024.07.25 #### Extractor changes diff --git a/supportedsites.md b/supportedsites.md index c8b8fbb35..e3bbe03ec 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -655,10 +655,11 @@ # Supported sites - **Ketnet** - **khanacademy** - **khanacademy:unit** - - **Kick** + - **kick:clips** + - **kick:live** + - **kick:vod** - **Kicker** - **KickStarter** - - **KickVOD** - **kinja:embed** - **KinoPoisk** - **Kommunetv** @@ -690,6 +691,7 @@ # Supported sites - **Lcp** - **LcpPlay** - **Le**: 乐视网 + - **LearningOnScreen** - **Lecture2Go**: (**Currently broken**) - **Lecturio**: [*lecturio*](## "netrc machine") - **LecturioCourse**: [*lecturio*](## "netrc machine") @@ -1140,7 +1142,6 @@ # Supported sites - **QuantumTV**: [*quantumtv*](## "netrc machine") - **QuantumTVLive**: [*quantumtv*](## "netrc machine") - **QuantumTVRecordings**: [*quantumtv*](## "netrc machine") - - **Qub** - **R7**: (**Currently broken**) - **R7Article**: (**Currently broken**) - **Radiko** @@ -1517,9 +1518,9 @@ # Supported sites - **tv5unis** - **tv5unis:video** - **tv8.it** - - **TVA** - **TVANouvelles** - **TVANouvellesArticle** + - **tvaplus**: TVA+ - **TVC** - **TVCArticle** - **TVer** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index e641bf5ae..81d1c2c96 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.07.25' +__version__ = '2024.08.01' -RELEASE_GIT_HEAD = 'f0993391e6052ec8f7aacc286609564f226943b9' +RELEASE_GIT_HEAD = 'ffd7781d6588926f820b44a34b9e6e3068fb9f97' VARIANT = None @@ -12,4 +12,4 @@ ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.07.25' +_pkg_version = '2024.08.01' From 0088c6de23d832b117061a33e984dc452d992e9c Mon Sep 17 00:00:00 2001 From: hugepower Date: Fri, 2 Aug 2024 00:40:46 +0800 Subject: [PATCH 82/95] [ie/youku] Fix extractor (#10626) Closes #10549 Authored by: hugepower --- yt_dlp/extractor/youku.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youku.py b/yt_dlp/extractor/youku.py index fa6b0539b..3bdfa6c93 100644 --- a/yt_dlp/extractor/youku.py +++ b/yt_dlp/extractor/youku.py @@ -136,7 +136,7 @@ def _real_extract(self, url): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0524', + 'ccode': '0564', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, From 919540a9644e55deb78cdd6751757ec8fdaf76f4 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 1 Aug 2024 15:25:46 -0500 Subject: [PATCH 83/95] [ie/olympics] Fix extraction (#10625) Bugfix for 2f1ddfe12a2c174bc777264c5c8ffe7ca0922d94 Closes #10592 Authored by: bashonly --- yt_dlp/extractor/olympics.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/olympics.py b/yt_dlp/extractor/olympics.py index a50c510cb..bbf83e531 100644 --- a/yt_dlp/extractor/olympics.py +++ b/yt_dlp/extractor/olympics.py @@ -4,7 +4,9 @@ ExtractorError, int_or_none, parse_iso8601, + parse_qs, try_get, + update_url, url_or_none, ) from ..utils.traversal import traverse_obj @@ -24,9 +26,6 @@ class OlympicsReplayIE(InfoExtractor): 'thumbnail': 'https://img.olympics.com/images/image/private/t_1-1_1280/primary/nua4o7zwyaznoaejpbk2', 'duration': 7017.0, }, - 'params': { - 'skip_download': True, - }, }, { 'url': 'https://olympics.com/en/original-series/episode/b-boys-and-b-girls-take-the-spotlight-breaking-life-road-to-paris-2024', 'info_dict': { @@ -74,7 +73,7 @@ def _extract_from_nextjs_data(self, webpage, video_id): is_live = traverse_obj(data, ('streamingStatus', {str})) == 'LIVE' m3u8_url = traverse_obj(data, ('videoUrl', {url_or_none})) or data['streamUrl'] - tokenized_url = m3u8_url if is_live else self._tokenize_url(m3u8_url, video_id) + tokenized_url = self._tokenize_url(m3u8_url, data['jwtToken'], is_live, video_id) try: formats, subtitles = self._extract_m3u8_formats_and_subtitles( @@ -95,10 +94,20 @@ def _extract_from_nextjs_data(self, webpage, video_id): }), } - def _tokenize_url(self, url, video_id): + def _tokenize_url(self, url, token, is_live, video_id): + return self._download_json( + 'https://metering.olympics.com/tokengenerator', video_id, + 'Downloading tokenized m3u8 url', query={ + **parse_qs(url), + 'url': update_url(url, query=None), + 'service-id': 'live' if is_live else 'vod', + 'user-auth': token, + })['data']['url'] + + def _legacy_tokenize_url(self, url, video_id): return self._download_json( 'https://olympics.com/tokenGenerator', video_id, - 'Downloading tokenized m3u8 url', query={'url': url}) + 'Downloading legacy tokenized m3u8 url', query={'url': url}) def _real_extract(self, url): video_id = self._match_id(url) @@ -130,7 +139,7 @@ def _real_extract(self, url): }) formats, subtitles = self._extract_m3u8_formats_and_subtitles( - self._tokenize_url(m3u8_url, video_uuid), video_uuid, 'mp4', m3u8_id='hls') + self._legacy_tokenize_url(m3u8_url, video_uuid), video_uuid, 'mp4', m3u8_id='hls') return { 'id': video_uuid, From e7d73bc4531ee3f91a46b15e218dcc1fbeb6226c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 4 Aug 2024 10:20:45 -0500 Subject: [PATCH 84/95] [ie/DiscoveryPlusItaly] Support sport and olympics URLs (#10655) Closes #10654 Authored by: bashonly --- yt_dlp/extractor/dplay.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index cdf84c52d..8d7707271 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -1147,13 +1147,19 @@ def _real_extract(self, url): class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/it/video' + DPlayBaseIE._PATH_REGEX + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/it/video(?:/sport|/olympics)?' + DPlayBaseIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi', 'only_matching': True, }, { 'url': 'https://www.discoveryplus.com/it/video/super-benny/trailer', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.com/it/video/olympics/dplus-sport-dplus-sport-sport/water-polo-greece-italy', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.com/it/video/sport/dplus-sport-dplus-sport-sport/lisa-vittozzi-allinferno-e-ritorno', + 'only_matching': True, }] _PRODUCT = 'dplus_it' From bb8bf1db993f59752d20b73b861bd55e40cf0e31 Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 5 Aug 2024 22:28:24 +0200 Subject: [PATCH 85/95] [jsinterp] Improve `slice` implementation (#10664) Authored by: seproDev --- test/test_jsinterp.py | 28 ++++++++++++++++++++++++++++ yt_dlp/jsinterp.py | 6 +++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index df92c8315..06840ed85 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -403,6 +403,34 @@ def test_split(self): self._test(jsi, [''], args=['', '-']) self._test(jsi, [], args=['', '']) + def test_slice(self): + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice()}', [0, 1, 2, 3, 4, 5, 6, 7, 8]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(0)}', [0, 1, 2, 3, 4, 5, 6, 7, 8]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(5)}', [5, 6, 7, 8]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(99)}', []) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(-2)}', [7, 8]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(-99)}', [0, 1, 2, 3, 4, 5, 6, 7, 8]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(0, 0)}', []) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(1, 0)}', []) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(0, 1)}', [0]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(3, 6)}', [3, 4, 5]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(1, -1)}', [1, 2, 3, 4, 5, 6, 7]) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(-1, 1)}', []) + self._test('function f(){return [0, 1, 2, 3, 4, 5, 6, 7, 8].slice(-3, -1)}', [6, 7]) + self._test('function f(){return "012345678".slice()}', '012345678') + self._test('function f(){return "012345678".slice(0)}', '012345678') + self._test('function f(){return "012345678".slice(5)}', '5678') + self._test('function f(){return "012345678".slice(99)}', '') + self._test('function f(){return "012345678".slice(-2)}', '78') + self._test('function f(){return "012345678".slice(-99)}', '012345678') + self._test('function f(){return "012345678".slice(0, 0)}', '') + self._test('function f(){return "012345678".slice(1, 0)}', '') + self._test('function f(){return "012345678".slice(0, 1)}', '0') + self._test('function f(){return "012345678".slice(3, 6)}', '345') + self._test('function f(){return "012345678".slice(1, -1)}', '1234567') + self._test('function f(){return "012345678".slice(-1, 1)}', '') + self._test('function f(){return "012345678".slice(-3, -1)}', '67') + if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 851d4dc7b..ba059babb 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -709,9 +709,9 @@ def eval_method(): obj.reverse() return obj elif member == 'slice': - assertion(isinstance(obj, list), 'must be applied on a list') - assertion(len(argvals) == 1, 'takes exactly one argument') - return obj[argvals[0]:] + assertion(isinstance(obj, (list, str)), 'must be applied on a list or string') + assertion(len(argvals) <= 2, 'takes between 0 and 2 arguments') + return obj[slice(*argvals, None)] elif member == 'splice': assertion(isinstance(obj, list), 'must be applied on a list') assertion(argvals, 'takes one or more arguments') From c86891eb9434b4d7eec426d38c0c625b5e13cb2f Mon Sep 17 00:00:00 2001 From: sepro Date: Mon, 5 Aug 2024 22:36:11 +0200 Subject: [PATCH 86/95] [ie/youtube] Fix `n` function name extraction for player `b12cc44b` (#10668) Authored by: seproDev --- test/test_youtube_signature.py | 4 ++++ yt_dlp/extractor/youtube.py | 9 ++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index d37df7a2e..0f7ae34f4 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -179,6 +179,10 @@ 'https://www.youtube.com/s/player/20dfca59/player_ias.vflset/en_US/base.js', '-fLCxedkAk4LUTK2', 'O8kfRq1y1eyHGw', ), + ( + 'https://www.youtube.com/s/player/b12cc44b/player_ias.vflset/en_US/base.js', + 'keLa5R2U00sR9SQK', 'N1OGyujjEwMnLw', + ), ] diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 88e1a28ae..46822cfde 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3180,6 +3180,7 @@ def _extract_n_function_name(self, jscode, player_url=None): # * b=String.fromCharCode(110),c=a.get(b))&&c=narray[idx](c) # * a.D&&(b="nn"[+a.D],c=a.get(b))&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") # * a.D&&(PL(a),b=a.j.n||null)&&(b=narray[0](b),a.set("n",b),narray.length||nfunc("") + # * a.D&&(b="nn"[+a.D],vL(a),c=a.j[b]||null)&&(c=narray[idx](c),a.set(b,c),narray.length||nfunc("") funcname, idx = self._search_regex( r'''(?x) (?: @@ -3187,7 +3188,13 @@ def _extract_n_function_name(self, jscode, player_url=None): (?: b=String\.fromCharCode\(110\)| (?P[a-zA-Z0-9_$.]+)&&\(b="nn"\[\+(?P=str_idx)\] - ),c=a\.get\(b\)\)&&\(c=| + ) + (?: + ,[a-zA-Z0-9_$]+\(a\))?,c=a\. + (?: + get\(b\)| + [a-zA-Z0-9_$]+\[b\]\|\|null + )\)&&\(c=| \b(?P[a-zA-Z0-9_$]+)= )(?P[a-zA-Z0-9_$]+)(?:\[(?P\d+)\])?\([a-zA-Z]\) (?(var),[a-zA-Z0-9_$]+\.set\("n"\,(?P=var)\),(?P=nfunc)\.length)''', From 406f4c2e47502fffc1b0c210b4ee6487c89a44cb Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 5 Aug 2024 18:26:50 -0500 Subject: [PATCH 87/95] [ie/youtube] Change default player clients to `ios,web_creator` (#10674) Closes #10660 Authored by: bashonly --- README.md | 2 +- yt_dlp/extractor/youtube.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index dd78012a8..ca32e09bf 100644 --- a/README.md +++ b/README.md @@ -1767,7 +1767,7 @@ # EXTRACTOR ARGUMENTS #### youtube * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively -* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,tv` is used, but `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. +* `player_client`: Clients to extract video data from. The main clients are `web`, `ios` and `android`, with variants `_music` and `_creator` (e.g. `ios_creator`); and `mediaconnect`, `mweb`, `android_producer`, `android_testsuite`, `android_vr`, `web_safari`, `web_embedded`, `tv` and `tv_embedded` with no variants. By default, `ios,web_creator` is used, and `tv_embedded`, `web_creator` and `mediaconnect` are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. Most `android` clients will be given lowest priority since their formats are broken. You can use `all` to use all the clients, and `default` for the default clients. * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp. * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 46822cfde..224c9b988 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3744,7 +3744,7 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, def _get_requested_clients(self, url, smuggled_data): requested_clients = [] broken_clients = [] - default = ['ios', 'tv'] + default = ['ios', 'web_creator'] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) From fc5eecfa31c9571b6031cc3968aaa0394be55d7a Mon Sep 17 00:00:00 2001 From: scribblemaniac Date: Mon, 5 Aug 2024 19:02:21 -0600 Subject: [PATCH 88/95] [ie/gem.cbc.ca:live] Fix extractor (#10565) Authored by: scribblemaniac, bashonly --- yt_dlp/extractor/cbc.py | 87 +++++++++++++++++++++++++++++------------ 1 file changed, 61 insertions(+), 26 deletions(-) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 373c9d2c9..40224f63f 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -806,11 +806,11 @@ class CBCGemLiveIE(InfoExtractor): 'title': 'Ottawa', 'description': 'The live TV channel and local programming from Ottawa', 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/CBC_OTT_VMS/Live_Channel_Static_Images/Ottawa_2880x1620.jpg', - 'is_live': True, + 'live_status': 'is_live', 'id': 'AyqZwxRqh8EH', 'ext': 'mp4', - 'timestamp': 1492106160, - 'upload_date': '20170413', + 'release_timestamp': 1492106160, + 'release_date': '20170413', 'uploader': 'CBCC-NEW', }, 'skip': 'Live might have ended', @@ -839,49 +839,84 @@ class CBCGemLiveIE(InfoExtractor): 'description': 'March 24, 2023 | President Biden’s Ottawa visit ends with big pledges from both countries. Plus, Gwyneth Paltrow testifies in her ski collision trial.', 'live_status': 'is_live', 'thumbnail': r're:https://images.gem.cbc.ca/v1/cbc-gem/live/.*', - 'timestamp': 1679706000, - 'upload_date': '20230325', + 'release_timestamp': 1679706000, + 'release_date': '20230325', }, 'params': {'skip_download': True}, 'skip': 'Live might have ended', }, + { # event replay (medianetlive) + 'url': 'https://gem.cbc.ca/live-event/42314', + 'md5': '297a9600f554f2258aed01514226a697', + 'info_dict': { + 'id': '42314', + 'ext': 'mp4', + 'live_status': 'was_live', + 'title': 'Women\'s Soccer - Canada vs New Zealand', + 'description': 'md5:36200e5f1a70982277b5a6ecea86155d', + 'thumbnail': r're:https://.+default\.jpg', + 'release_timestamp': 1721917200, + 'release_date': '20240725', + }, + 'params': {'skip_download': True}, + 'skip': 'Replay might no longer be available', + }, + { # event replay (medianetlive) + 'url': 'https://gem.cbc.ca/live-event/43273', + 'only_matching': True, + }, ] + _GEO_COUNTRIES = ['CA'] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) video_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['data'] - # Two types of metadata JSON + # Three types of video_info JSON: info in root, freeTv stream/item, event replay if not video_info.get('formattedIdMedia'): - video_info = traverse_obj( - video_info, (('freeTv', ('streams', ...)), 'items', lambda _, v: v['key'] == video_id, {dict}), - get_all=False, default={}) + if traverse_obj(video_info, ('event', 'key')) == video_id: + video_info = video_info['event'] + else: + video_info = traverse_obj(video_info, ( + ('freeTv', ('streams', ...)), 'items', + lambda _, v: v['key'].partition('-')[0] == video_id, any)) or {} video_stream_id = video_info.get('formattedIdMedia') if not video_stream_id: - raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) + raise ExtractorError( + 'Couldn\'t find video metadata, maybe this livestream is now offline', expected=True) - stream_data = self._download_json( - 'https://services.radio-canada.ca/media/validation/v2/', video_id, query={ - 'appCode': 'mpx', - 'connectionType': 'hd', - 'deviceType': 'ipad', - 'idMedia': video_stream_id, - 'multibitrate': 'true', - 'output': 'json', - 'tech': 'hls', - 'manifestType': 'desktop', - }) + live_status = 'was_live' if video_info.get('isVodEnabled') else 'is_live' + release_timestamp = traverse_obj(video_info, ('airDate', {parse_iso8601})) + + if live_status == 'is_live' and release_timestamp and release_timestamp > time.time(): + formats = [] + live_status = 'is_upcoming' + self.raise_no_formats('This livestream has not yet started', expected=True) + else: + stream_data = self._download_json( + 'https://services.radio-canada.ca/media/validation/v2/', video_id, query={ + 'appCode': 'medianetlive', + 'connectionType': 'hd', + 'deviceType': 'ipad', + 'idMedia': video_stream_id, + 'multibitrate': 'true', + 'output': 'json', + 'tech': 'hls', + 'manifestType': 'desktop', + }) + formats = self._extract_m3u8_formats( + stream_data['url'], video_id, 'mp4', live=live_status == 'is_live') return { 'id': video_id, - 'formats': self._extract_m3u8_formats(stream_data['url'], video_id, 'mp4', live=True), - 'is_live': True, + 'formats': formats, + 'live_status': live_status, + 'release_timestamp': release_timestamp, **traverse_obj(video_info, { - 'title': 'title', - 'description': 'description', + 'title': ('title', {str}), + 'description': ('description', {str}), 'thumbnail': ('images', 'card', 'url'), - 'timestamp': ('airDate', {parse_iso8601}), }), } From 4d9231208332d4c32364b8cd814bff8b20232cae Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 5 Aug 2024 21:50:06 -0500 Subject: [PATCH 89/95] [ie/niconico] Fix extractor (#10677) Closes #10662 Authored by: bashonly --- yt_dlp/extractor/niconico.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 9d7b010c5..179e7a9b1 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -40,7 +40,6 @@ class NiconicoIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', - 'md5': 'd1a75c0823e2f629128c43e1212760f9', 'info_dict': { 'id': 'sm22312215', 'ext': 'mp4', @@ -56,8 +55,8 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, 'genres': ['未設定'], 'tags': [], - 'expected_protocol': str, }, + 'params': {'skip_download': 'm3u8'}, }, { # File downloaded with and without credentials are different, so omit # the md5 field @@ -77,8 +76,8 @@ class NiconicoIE(InfoExtractor): 'view_count': int, 'genres': ['音楽・サウンド'], 'tags': ['Translation_Request', 'Kagamine_Rin', 'Rin_Original'], - 'expected_protocol': str, }, + 'params': {'skip_download': 'm3u8'}, }, { # 'video exists but is marked as "deleted" # md5 is unstable @@ -112,7 +111,6 @@ class NiconicoIE(InfoExtractor): }, { # video not available via `getflv`; "old" HTML5 video 'url': 'http://www.nicovideo.jp/watch/sm1151009', - 'md5': 'f95a3d259172667b293530cc2e41ebda', 'info_dict': { 'id': 'sm1151009', 'ext': 'mp4', @@ -128,11 +126,10 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, 'genres': ['ゲーム'], 'tags': [], - 'expected_protocol': str, }, + 'params': {'skip_download': 'm3u8'}, }, { # "New" HTML5 video - # md5 is unstable 'url': 'http://www.nicovideo.jp/watch/sm31464864', 'info_dict': { 'id': 'sm31464864', @@ -149,12 +146,11 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, 'genres': ['アニメ'], 'tags': [], - 'expected_protocol': str, }, + 'params': {'skip_download': 'm3u8'}, }, { # Video without owner 'url': 'http://www.nicovideo.jp/watch/sm18238488', - 'md5': 'd265680a1f92bdcbbd2a507fc9e78a9e', 'info_dict': { 'id': 'sm18238488', 'ext': 'mp4', @@ -168,8 +164,8 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, 'genres': ['エンターテイメント'], 'tags': [], - 'expected_protocol': str, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, @@ -458,9 +454,11 @@ def _real_extract(self, url): if video_id.startswith('so'): video_id = self._match_id(handle.url) - api_data = self._parse_json(self._html_search_regex( - 'data-api-data="([^"]+)"', webpage, - 'API data', default='{}'), video_id) + api_data = traverse_obj( + self._parse_json(self._html_search_meta('server-response', webpage) or '', video_id), + ('data', 'response', {dict})) + if not api_data: + raise ExtractorError('Server response data not found') except ExtractorError as e: try: api_data = self._download_json( From a065086640e888e8d58c615d52ed2f4f4e4c9d18 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 6 Aug 2024 03:03:12 +0000 Subject: [PATCH 90/95] Release 2024.08.06 Created by: bashonly :ci skip all :ci run dl --- CONTRIBUTORS | 2 ++ Changelog.md | 15 +++++++++++++++ yt_dlp/version.py | 6 +++--- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 2180ecfe2..489ab7da8 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -655,3 +655,5 @@ iancmy mokrueger luvyana szantnerb +hugepower +scribblemaniac diff --git a/Changelog.md b/Changelog.md index 73bf828a6..0b96ab29c 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,21 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.08.06 + +#### Core changes +- **jsinterp**: [Improve `slice` implementation](https://github.com/yt-dlp/yt-dlp/commit/bb8bf1db993f59752d20b73b861bd55e40cf0e31) ([#10664](https://github.com/yt-dlp/yt-dlp/issues/10664)) by [seproDev](https://github.com/seproDev) + +#### Extractor changes +- **discoveryplusitaly**: [Support sport and olympics URLs](https://github.com/yt-dlp/yt-dlp/commit/e7d73bc4531ee3f91a46b15e218dcc1fbeb6226c) ([#10655](https://github.com/yt-dlp/yt-dlp/issues/10655)) by [bashonly](https://github.com/bashonly) +- **gem.cbc.ca**: live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/fc5eecfa31c9571b6031cc3968aaa0394be55d7a) ([#10565](https://github.com/yt-dlp/yt-dlp/issues/10565)) by [bashonly](https://github.com/bashonly), [scribblemaniac](https://github.com/scribblemaniac) +- **niconico**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4d9231208332d4c32364b8cd814bff8b20232cae) ([#10677](https://github.com/yt-dlp/yt-dlp/issues/10677)) by [bashonly](https://github.com/bashonly) +- **olympics**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/919540a9644e55deb78cdd6751757ec8fdaf76f4) ([#10625](https://github.com/yt-dlp/yt-dlp/issues/10625)) by [bashonly](https://github.com/bashonly) +- **youku**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0088c6de23d832b117061a33e984dc452d992e9c) ([#10626](https://github.com/yt-dlp/yt-dlp/issues/10626)) by [hugepower](https://github.com/hugepower) +- **youtube** + - [Change default player clients to `ios,web_creator`](https://github.com/yt-dlp/yt-dlp/commit/406f4c2e47502fffc1b0c210b4ee6487c89a44cb) ([#10674](https://github.com/yt-dlp/yt-dlp/issues/10674)) by [bashonly](https://github.com/bashonly) + - [Fix `n` function name extraction for player `b12cc44b`](https://github.com/yt-dlp/yt-dlp/commit/c86891eb9434b4d7eec426d38c0c625b5e13cb2f) ([#10668](https://github.com/yt-dlp/yt-dlp/issues/10668)) by [seproDev](https://github.com/seproDev) + ### 2024.08.01 #### Core changes diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 81d1c2c96..6633a11b9 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.08.01' +__version__ = '2024.08.06' -RELEASE_GIT_HEAD = 'ffd7781d6588926f820b44a34b9e6e3068fb9f97' +RELEASE_GIT_HEAD = '4d9231208332d4c32364b8cd814bff8b20232cae' VARIANT = None @@ -12,4 +12,4 @@ ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.08.01' +_pkg_version = '2024.08.06' From 49f3741a820ed142f6866317c2e7d247b130960e Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Mon, 12 Aug 2024 04:12:46 -0500 Subject: [PATCH 91/95] [ie/youtube] Support excluding `player_client`s in extractor-arg (#10710) Closes #10699 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 224c9b988..2501398ba 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -1339,6 +1339,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): short_client_name(client): client for client in ('android', 'android_creator', 'android_music') } + _DEFAULT_CLIENTS = ('ios', 'web_creator') _GEO_BYPASS = False @@ -3744,17 +3745,19 @@ def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, def _get_requested_clients(self, url, smuggled_data): requested_clients = [] broken_clients = [] - default = ['ios', 'web_creator'] + excluded_clients = [] allowed_clients = sorted( (client for client in INNERTUBE_CLIENTS if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) for client in self._configuration_arg('player_client'): if client == 'default': - requested_clients.extend(default) + requested_clients.extend(self._DEFAULT_CLIENTS) elif client == 'all': requested_clients.extend(allowed_clients) + elif client.startswith('-'): + excluded_clients.append(client[1:]) elif client not in allowed_clients: - self.report_warning(f'Skipping unsupported client {client}') + self.report_warning(f'Skipping unsupported client "{client}"') elif client in self._BROKEN_CLIENTS.values(): broken_clients.append(client) else: @@ -3762,7 +3765,12 @@ def _get_requested_clients(self, url, smuggled_data): # Force deprioritization of _BROKEN_CLIENTS for format de-duplication requested_clients.extend(broken_clients) if not requested_clients: - requested_clients = default + requested_clients.extend(self._DEFAULT_CLIENTS) + for excluded_client in excluded_clients: + if excluded_client in requested_clients: + requested_clients.remove(excluded_client) + if not requested_clients: + raise ExtractorError('No player clients have been requested', expected=True) if smuggled_data.get('is_music_url') or self.is_music_url(url): for requested_client in requested_clients: From 232e6db30c474d1b387e405342f34173ceeaf832 Mon Sep 17 00:00:00 2001 From: Hank Brown Date: Tue, 13 Aug 2024 18:26:55 -0500 Subject: [PATCH 92/95] [ie/PatreonCampaign] Support API URLs (#10734) Closes #10733 Authored by: hibes, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/patreon.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index 7d6e8439c..4489d533a 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -420,7 +420,7 @@ def _get_comments(self, post_id): class PatreonCampaignIE(PatreonBaseIE): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m/(?P\d+))|(?P[-\w]+))' + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m|api/campaigns)/(?P\d+)|(?P[-\w]+))' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', 'info_dict': { @@ -442,25 +442,44 @@ class PatreonCampaignIE(PatreonBaseIE): 'url': 'https://www.patreon.com/m/4767637/posts', 'info_dict': { 'title': 'Not Just Bikes', - 'channel_follower_count': int, 'id': '4767637', 'channel_id': '4767637', 'channel_url': 'https://www.patreon.com/notjustbikes', - 'description': 'md5:595c6e7dca76ae615b1d38c298a287a1', + 'description': 'md5:9f4b70051216c4d5c58afe580ffc8d0f', 'age_limit': 0, 'channel': 'Not Just Bikes', 'uploader_url': 'https://www.patreon.com/notjustbikes', - 'uploader': 'Not Just Bikes', + 'uploader': 'Jason', 'uploader_id': '37306634', 'thumbnail': r're:^https?://.*$', }, 'playlist_mincount': 71, + }, { + 'url': 'https://www.patreon.com/api/campaigns/4243769/posts', + 'info_dict': { + 'title': 'Second Thought', + 'channel_follower_count': int, + 'id': '4243769', + 'channel_id': '4243769', + 'channel_url': 'https://www.patreon.com/secondthought', + 'description': 'md5:69c89a3aba43efdb76e85eb023e8de8b', + 'age_limit': 0, + 'channel': 'Second Thought', + 'uploader_url': 'https://www.patreon.com/secondthought', + 'uploader': 'JT Chapman', + 'uploader_id': '32718287', + 'thumbnail': r're:^https?://.*$', + }, + 'playlist_mincount': 201, }, { 'url': 'https://www.patreon.com/dissonancepod/posts', 'only_matching': True, }, { 'url': 'https://www.patreon.com/m/5932659', 'only_matching': True, + }, { + 'url': 'https://www.patreon.com/api/campaigns/4243769', + 'only_matching': True, }] @classmethod From b43bd864851f2862e26caa85461c5d825d49d463 Mon Sep 17 00:00:00 2001 From: N/Ame <173015200+grqz@users.noreply.github.com> Date: Fri, 16 Aug 2024 07:33:41 +1200 Subject: [PATCH 93/95] [ie/bilibili] Fix festival URL support (#10740) Closes #10739 Authored by: grqz, bashonly Co-authored-by: bashonly <88596187+bashonly@users.noreply.github.com> --- yt_dlp/extractor/bilibili.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index a84b7a6f7..3163df8ab 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -298,7 +298,7 @@ def _get_interactive_entries(self, video_id, cid, metainfo, headers=None): class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/[^/?#]+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -622,6 +622,10 @@ class BiliBiliIE(BilibiliBaseIE): 'ext': 'mp4', }, 'skip': 'geo-restricted', + }, { + 'note': 'has - in the last path segment of the url', + 'url': 'https://www.bilibili.com/festival/bh3-7th?bvid=BV1tr4y1f7p2&', + 'only_matching': True, }] def _real_extract(self, url): From cc88a54bb1ef285154775f8a6a413335ce4c71ce Mon Sep 17 00:00:00 2001 From: Christopher Schreiner Date: Thu, 15 Aug 2024 21:50:08 +0200 Subject: [PATCH 94/95] [ie/adn] Fix extractors (#10749) Closes #10748 Authored by: infanf --- yt_dlp/extractor/adn.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index 337071794..c8a261375 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -49,9 +49,9 @@ class ADNBaseIE(InfoExtractor): class ADNIE(ADNBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.com/(?:(?Pde)/)?video/[^/?#]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?animationdigitalnetwork\.com/(?:(?Pde)/)?video/[^/?#]+/(?P\d+)' _TESTS = [{ - 'url': 'https://animationdigitalnetwork.com/video/fruits-basket/9841-episode-1-a-ce-soir', + 'url': 'https://animationdigitalnetwork.com/video/558-fruits-basket/9841-episode-1-a-ce-soir', 'md5': '1c9ef066ceb302c86f80c2b371615261', 'info_dict': { 'id': '9841', @@ -71,10 +71,7 @@ class ADNIE(ADNBaseIE): }, 'skip': 'Only available in French and German speaking Europe', }, { - 'url': 'http://animedigitalnetwork.com/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'only_matching': True, - }, { - 'url': 'https://animationdigitalnetwork.com/de/video/the-eminence-in-shadow/23550-folge-1', + 'url': 'https://animationdigitalnetwork.com/de/video/973-the-eminence-in-shadow/23550-folge-1', 'md5': '5c5651bf5791fa6fcd7906012b9d94e8', 'info_dict': { 'id': '23550', @@ -167,7 +164,7 @@ def _perform_login(self, username, password): 'username': username, })) or {}).get('accessToken') if access_token: - self._HEADERS = {'authorization': 'Bearer ' + access_token} + self._HEADERS['Authorization'] = f'Bearer {access_token}' except ExtractorError as e: message = None if isinstance(e.cause, HTTPError) and e.cause.status == 401: @@ -178,6 +175,7 @@ def _perform_login(self, username, password): def _real_extract(self, url): lang, video_id = self._match_valid_url(url).group('lang', 'id') + self._HEADERS['X-Target-Distribution'] = lang or 'fr' video_base_url = self._PLAYER_BASE_URL + f'video/{video_id}/' player = self._download_json( video_base_url + 'configuration', video_id, @@ -218,7 +216,6 @@ def _real_extract(self, url): links_data = self._download_json( links_url, video_id, 'Downloading links JSON metadata', headers={ 'X-Player-Token': authorization, - 'X-Target-Distribution': lang or 'fr', **self._HEADERS, }, query={ 'freeWithAds': 'true', @@ -257,6 +254,7 @@ def _real_extract(self, url): load_balancer_data = self._download_json( load_balancer_url, video_id, f'Downloading {format_id} {quality} JSON metadata', + headers=self._HEADERS, fatal=False) or {} m3u8_url = load_balancer_data.get('location') if not m3u8_url: @@ -277,7 +275,7 @@ def _real_extract(self, url): video = (self._download_json( self._API_BASE_URL + f'video/{video_id}', video_id, - 'Downloading additional video metadata', fatal=False) or {}).get('video') or {} + 'Downloading additional video metadata', fatal=False, headers=self._HEADERS) or {}).get('video') or {} show = video.get('show') or {} return { @@ -299,9 +297,9 @@ def _real_extract(self, url): class ADNSeasonIE(ADNBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.com/(?:(?Pde)/)?video/(?P[^/?#]+)/?(?:$|[#?])' + _VALID_URL = r'https?://(?:www\.)?animationdigitalnetwork\.com/(?:(?Pde)/)?video/(?P\d+)[^/?#]*/?(?:$|[#?])' _TESTS = [{ - 'url': 'https://animationdigitalnetwork.com/video/tokyo-mew-mew-new', + 'url': 'https://animationdigitalnetwork.com/video/911-tokyo-mew-mew-new', 'playlist_count': 12, 'info_dict': { 'id': '911', @@ -312,16 +310,14 @@ class ADNSeasonIE(ADNBaseIE): def _real_extract(self, url): lang, video_show_slug = self._match_valid_url(url).group('lang', 'id') + self._HEADERS['X-Target-Distribution'] = lang or 'fr' show = self._download_json( f'{self._API_BASE_URL}show/{video_show_slug}/', video_show_slug, 'Downloading show JSON metadata', headers=self._HEADERS)['show'] show_id = str(show['id']) episodes = self._download_json( f'{self._API_BASE_URL}video/show/{show_id}', video_show_slug, - 'Downloading episode list', headers={ - 'X-Target-Distribution': lang or 'fr', - **self._HEADERS, - }, query={ + 'Downloading episode list', headers=self._HEADERS, query={ 'order': 'asc', 'limit': '-1', }) From d62fef7e07d454c0d2ba2d69fb96d691dba1ded0 Mon Sep 17 00:00:00 2001 From: kclauhk <78251477+kclauhk@users.noreply.github.com> Date: Fri, 16 Aug 2024 03:53:37 +0800 Subject: [PATCH 95/95] [ie/facebook:ads] Fix extractor (#10704) Closes #10701 Authored by: kclauhk --- yt_dlp/extractor/facebook.py | 43 ++++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 6aba477a6..a43ffe95e 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -963,6 +963,7 @@ class FacebookAdsIE(InfoExtractor): 'id': '899206155126718', 'ext': 'mp4', 'title': 'video by Kandao', + 'description': 'md5:0822724069e3aca97cbed5dabbab282e', 'uploader': 'Kandao', 'uploader_id': '774114102743284', 'uploader_url': r're:^https?://.*', @@ -971,6 +972,22 @@ class FacebookAdsIE(InfoExtractor): 'upload_date': '20231214', 'like_count': int, }, + }, { + # key 'watermarked_video_sd_url' missing + 'url': 'https://www.facebook.com/ads/library/?id=501152689226254', + 'info_dict': { + 'id': '501152689226254', + 'ext': 'mp4', + 'title': 'video by mat.nawrocki', + 'description': 'md5:02a446ace7ff8c3c37a2892922492490', + 'uploader': 'mat.nawrocki', + 'uploader_id': '148586968341456', + 'uploader_url': r're:^https?://.*', + 'timestamp': 1723452305, + 'thumbnail': r're:^https?://.*', + 'upload_date': '20240812', + 'like_count': int, + }, }, { 'url': 'https://www.facebook.com/ads/library/?id=893637265423481', 'info_dict': { @@ -1017,34 +1034,42 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - post_data = [self._parse_json(j, video_id, fatal=False) - for j in re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage)] - data = traverse_obj(post_data, ( - ..., 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}), get_all=False) + post_data = traverse_obj( + re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})', webpage), (..., {json.loads})) + data = get_first(post_data, ( + 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., + 'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict})) if not data: raise ExtractorError('Unable to extract ad data') title = data.get('title') if not title or title == '{{product.name}}': title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data) + markup_id = traverse_obj(data, ('body', '__m', {str})) + markup = traverse_obj(post_data, ( + ..., 'require', ..., ..., ..., '__bbox', 'markup', lambda _, v: v[0].startswith(markup_id), + ..., '__html', {clean_html}, {lambda x: not x.startswith('{{product.') and x}, any)) - info_dict = traverse_obj(data, { - 'description': ('link_description', {str}, {lambda x: x if x != '{{product.description}}' else None}), + info_dict = merge_dicts({ + 'title': title, + 'description': markup or None, + }, traverse_obj(data, { + 'description': ('link_description', {lambda x: x if not x.startswith('{{product.') else None}), 'uploader': ('page_name', {str}), 'uploader_id': ('page_id', {str_or_none}), 'uploader_url': ('page_profile_uri', {url_or_none}), 'timestamp': ('creation_time', {int_or_none}), 'like_count': ('page_like_count', {int_or_none}), - }) + })) entries = [] for idx, entry in enumerate(traverse_obj( - data, (('videos', 'cards'), lambda _, v: any(url_or_none(v[f]) for f in self._FORMATS_MAP))), 1, + data, (('videos', 'cards'), lambda _, v: any(url_or_none(v.get(f)) for f in self._FORMATS_MAP))), 1, ): entries.append({ 'id': f'{video_id}_{idx}', 'title': entry.get('title') or title, - 'description': entry.get('link_description') or info_dict.get('description'), + 'description': traverse_obj(entry, 'body', 'link_description') or info_dict.get('description'), 'thumbnail': url_or_none(entry.get('video_preview_image_url')), 'formats': self._extract_formats(entry), })