From 58786a10f212bd63f9ad1d0b4d9e4d31c3b385e2 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Sun, 25 Jun 2023 20:10:00 +0530 Subject: [PATCH 001/218] [extractor/youtube] Add extractor-arg `formats` Closes #7417 --- README.md | 3 +-- yt_dlp/extractor/youtube.py | 22 ++++++++++++++++------ 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 4de4ece96..d89bb204e 100644 --- a/README.md +++ b/README.md @@ -1805,8 +1805,7 @@ #### youtube * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side) * `max_comments`: Limit the amount of comments to gather. Comma-separated list of integers representing `max-comments,max-parents,max-replies,max-replies-per-thread`. Default is `all,all,all,all` * E.g. `all,all,1000,10` will get a maximum of 1000 replies total, with up to 10 replies per thread. `1000,all,100` will get a maximum of 1000 comments, with a maximum of 100 replies total -* `include_duplicate_formats`: Extract formats with identical content but different URLs or protocol. This is useful if some of the formats are unavailable or throttled. -* `include_incomplete_formats`: Extract formats that cannot be downloaded completely (live dash and post-live m3u8) +* `formats`: Change the types of formats to return. `dashy` (convert http to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a0d0a601a..bdc631ccb 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3752,7 +3752,12 @@ def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, l 'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres' ]) streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...)) - all_formats = self._configuration_arg('include_duplicate_formats') + format_types = self._configuration_arg('formats') + all_formats = 'duplicate' in format_types + if self._configuration_arg('include_duplicate_formats'): + all_formats = True + self._downloader.deprecated_feature('[youtube] include_duplicate_formats extractor argument is deprecated. 
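The new `formats` argument consolidates the two boolean arguments it replaces above. A minimal sketch of selecting it through the Python API (the `extractor_args` option mirrors `--extractor-args "youtube:formats=duplicate"`; the URL is a placeholder):

```python
import yt_dlp

ydl_opts = {'extractor_args': {'youtube': {'formats': ['duplicate']}}}
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    # Duplicate (and 'dashy') format entries will now appear in the format list
    info = ydl.extract_info('https://www.youtube.com/watch?v=BaW_jenozKc', download=False)
```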
' + 'Use formats=duplicate extractor argument instead') def build_fragments(f): return LazyList({ @@ -3892,18 +3897,23 @@ def build_fragments(f): if single_stream and dct.get('ext'): dct['container'] = dct['ext'] + '_dash' - if all_formats and dct['filesize']: + if (all_formats or 'dashy' in format_types) and dct['filesize']: yield { **dct, 'format_id': f'{dct["format_id"]}-dashy' if all_formats else dct['format_id'], 'protocol': 'http_dash_segments', 'fragments': build_fragments(dct), } - dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE} - yield dct + if all_formats or 'dashy' not in format_types: + dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE} + yield dct needs_live_processing = self._needs_live_processing(live_status, duration) - skip_bad_formats = not self._configuration_arg('include_incomplete_formats') + skip_bad_formats = 'incomplete' not in format_types + if self._configuration_arg('include_incomplete_formats'): + skip_bad_formats = False + self._downloader.deprecated_feature('[youtube] include_incomplete_formats extractor argument is deprecated. ' + 'Use formats=incomplete extractor argument instead') skip_manifests = set(self._configuration_arg('skip')) if (not self.get_param('youtube_include_hls_manifest', True) @@ -3915,7 +3925,7 @@ def build_fragments(f): skip_manifests.add('dash') if self._configuration_arg('include_live_dash'): self._downloader.deprecated_feature('[youtube] include_live_dash extractor argument is deprecated. ' - 'Use include_incomplete_formats extractor argument instead') + 'Use formats=incomplete extractor argument instead') elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': skip_manifests.add('dash') From f0a1ff118145b6449982ba401f9a9f656ecd8062 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sun, 25 Jun 2023 13:13:28 -0500 Subject: [PATCH 002/218] [extractor/qdance] Add extractor (#7420) Closes #7385 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/qdance.py | 150 ++++++++++++++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 yt_dlp/extractor/qdance.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 49a3f39d3..06340fcd8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1531,6 +1531,7 @@ ) from .puls4 import Puls4IE from .pyvideo import PyvideoIE +from .qdance import QDanceIE from .qingting import QingTingIE from .qqmusic import ( QQMusicIE, diff --git a/yt_dlp/extractor/qdance.py b/yt_dlp/extractor/qdance.py new file mode 100644 index 000000000..d817677f0 --- /dev/null +++ b/yt_dlp/extractor/qdance.py @@ -0,0 +1,150 @@ +import json +import time + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + jwt_decode_hs256, + str_or_none, + traverse_obj, + try_call, + url_or_none, +) + + +class QDanceIE(InfoExtractor): + _NETRC_MACHINE = 'qdance' + _VALID_URL = r'https?://(?:www\.)?q-dance\.com/network/(?:library|live)/(?P\d+)' + _TESTS = [{ + 'note': 'vod', + 'url': 'https://www.q-dance.com/network/library/146542138', + 'info_dict': { + 'id': '146542138', + 'ext': 'mp4', + 'title': 'Sound Rush [LIVE] | Defqon.1 Weekend Festival 2022 | Friday | RED', + 'display_id': 'sound-rush-live-v3-defqon-1-weekend-festival-2022-friday-red', + 'description': 'Relive Defqon.1 - Primal Energy 2022 with the sounds of Sound Rush LIVE at the RED on Friday! 
🔥', + 'season': 'Defqon.1 Weekend Festival 2022', + 'season_id': '31840632', + 'series': 'Defqon.1', + 'series_id': '31840378', + 'thumbnail': 'https://images.q-dance.network/1674829540-20220624171509-220624171509_delio_dn201093-2.jpg', + 'availability': 'premium_only', + 'duration': 1829, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'livestream', + 'url': 'https://www.q-dance.com/network/live/149170353', + 'info_dict': { + 'id': '149170353', + 'ext': 'mp4', + 'title': r're:^Defqon\.1 2023 - Friday - RED', + 'display_id': 'defqon-1-2023-friday-red', + 'description': 'md5:3c73fbbd4044e578e696adfc64019163', + 'season': 'Defqon.1 Weekend Festival 2023', + 'season_id': '141735599', + 'series': 'Defqon.1', + 'series_id': '31840378', + 'thumbnail': 'https://images.q-dance.network/1686849069-area-thumbs_red.png', + 'availability': 'subscriber_only', + 'live_status': 'is_live', + 'channel_id': 'qdancenetwork.video_149170353', + }, + 'skip': 'Completed livestream', + }] + + _access_token = None + _refresh_token = None + + def _call_login_api(self, data, note='Logging in'): + login = self._download_json( + 'https://members.id-t.com/api/auth/login', None, note, headers={ + 'content-type': 'application/json', + 'brand': 'qdance', + 'origin': 'https://www.q-dance.com', + 'referer': 'https://www.q-dance.com/', + }, data=json.dumps(data, separators=(',', ':')).encode(), + expected_status=lambda x: True) + + tokens = traverse_obj(login, ('data', { + '_id-t-accounts-token': ('accessToken', {str}), + '_id-t-accounts-refresh': ('refreshToken', {str}), + '_id-t-accounts-id-token': ('idToken', {str}), + })) + + if not tokens.get('_id-t-accounts-token'): + error = ': '.join(traverse_obj(login, ('error', ('code', 'message'), {str}))) + if 'validation_error' not in error: + raise ExtractorError(f'Q-Dance API said "{error}"') + msg = 'Invalid username or password' if 'email' in data else 'Refresh token has expired' + raise ExtractorError(msg, expected=True) + + for name, value in tokens.items(): + self._set_cookie('.q-dance.com', name, value) + + def _perform_login(self, username, password): + self._call_login_api({'email': username, 'password': password}) + + def _real_initialize(self): + cookies = self._get_cookies('https://www.q-dance.com/') + self._refresh_token = try_call(lambda: cookies['_id-t-accounts-refresh'].value) + self._access_token = try_call(lambda: cookies['_id-t-accounts-token'].value) + if not self._access_token: + self.raise_login_required() + + def _get_auth(self): + if (try_call(lambda: jwt_decode_hs256(self._access_token)['exp']) or 0) <= int(time.time() - 120): + if not self._refresh_token: + raise ExtractorError( + 'Cannot refresh access token, login with yt-dlp or refresh cookies in browser') + self._call_login_api({'refreshToken': self._refresh_token}, note='Refreshing access token') + self._real_initialize() + + return {'Authorization': self._access_token} + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._search_nuxt_data(webpage, video_id, traverse=('data', 0, 'data')) + + def extract_availability(level): + level = int_or_none(level) or 0 + return self._availability( + needs_premium=(level >= 20), needs_subscription=(level >= 15), needs_auth=True) + + info = traverse_obj(data, { + 'title': ('title', {str.strip}), + 'description': ('description', {str.strip}), + 'display_id': ('slug', {str}), + 'thumbnail': ('thumbnail', {url_or_none}), + 'duration': ('durationInSeconds', {int_or_none}, 
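The metadata mapping here relies on `traverse_obj` filter syntax: a `{type}` step keeps only values of that type, while a `{callable}` step transforms the value, with any failure silently dropped. A self-contained illustration with made-up data:

```python
from yt_dlp.utils import traverse_obj

data = {'event': {'id': 31840378, 'title': ' Defqon.1 '}, 'acts': ['Sound Rush', None, 42]}
print(traverse_obj(data, ('event', 'title', {str.strip})))  # 'Defqon.1'
print(traverse_obj(data, ('acts', ..., {str})))             # ['Sound Rush']
```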
{lambda x: x or None}), + 'availability': ('subscription', 'level', {extract_availability}), + 'is_live': ('type', {lambda x: x.lower() == 'live'}), + 'artist': ('acts', ..., {str}), + 'series': ('event', 'title', {str.strip}), + 'series_id': ('event', 'id', {str_or_none}), + 'season': ('eventEdition', 'title', {str.strip}), + 'season_id': ('eventEdition', 'id', {str_or_none}), + 'channel_id': ('pubnub', 'channelName', {str}), + }) + + stream = self._download_json( + f'https://dc9h6qmsoymbq.cloudfront.net/api/content/videos/{video_id}/url', + video_id, headers=self._get_auth(), expected_status=401) + + m3u8_url = traverse_obj(stream, ('data', 'url', {url_or_none})) + if not m3u8_url and traverse_obj(stream, ('error', 'code')) == 'unauthorized': + raise ExtractorError('Your account does not have access to this content', expected=True) + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, fatal=False, live=True) if m3u8_url else [] + if not formats: + self.raise_no_formats('No active streams found', expected=bool(info.get('is_live'))) + + return { + **info, + 'id': video_id, + 'formats': formats, + } From 5e16cf92eb496b7c1541a6b1d727cb87542984db Mon Sep 17 00:00:00 2001 From: nnoboa <90611593+nnoboa@users.noreply.github.com> Date: Sun, 25 Jun 2023 16:22:38 -0400 Subject: [PATCH 003/218] [extractor/AdultSwim] Extract subtitles from m3u8 (#7421) Authored by: nnoboa Closes #6191 --- yt_dlp/extractor/adultswim.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/adultswim.py b/yt_dlp/extractor/adultswim.py index bd29eb43e..daaeddeb6 100644 --- a/yt_dlp/extractor/adultswim.py +++ b/yt_dlp/extractor/adultswim.py @@ -170,8 +170,10 @@ def _real_extract(self, url): continue ext = determine_ext(asset_url, mimetype2ext(asset.get('mime_type'))) if ext == 'm3u8': - info['formats'].extend(self._extract_m3u8_formats( - asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + fmts, subs = self._extract_m3u8_formats_and_subtitles( + asset_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + info['formats'].extend(fmts) + self._merge_subtitles(subs, target=info['subtitles']) elif ext == 'f4m': continue # info['formats'].extend(self._extract_f4m_formats( From ef8509c300ea50da86aea447eb214d3d6f6db6bb Mon Sep 17 00:00:00 2001 From: bashonly Date: Sun, 25 Jun 2023 17:04:42 -0500 Subject: [PATCH 004/218] [extractor/kick] Fix `_VALID_URL` Closes #7384 Authored by: bashonly --- yt_dlp/extractor/kick.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index 765ffa0c8..be1dfd4b1 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -30,7 +30,7 @@ def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, * class KickIE(KickBaseIE): - _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P[\w_]+)' + _VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P[\w-]+)' _TESTS = [{ 'url': 'https://kick.com/yuppy', 'info_dict': { From d949c10c45bfc359bdacd52e6a180169b8128958 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 26 Jun 2023 07:25:47 +0530 Subject: [PATCH 005/218] [extractor/youtube] Process `post_live` over 2 hours --- yt_dlp/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index bdc631ccb..d5607975e 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3737,7 
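The Kick `_VALID_URL` fix above swaps `[\w_]+` for `[\w-]+`: `\w` already matches the underscore, so the old class added nothing while rejecting hyphenated channel names. A quick standalone check (the channel URL is made up):

```python
import re

_VALID_URL = r'https?://(?:www\.)?kick\.com/(?!(?:video|categories|search|auth)(?:[/?#]|$))(?P<id>[\w-]+)'
print(re.match(_VALID_URL, 'https://kick.com/some-streamer').group('id'))  # 'some-streamer'
```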
+3737,7 @@ def append_client(*client_names): def _needs_live_processing(self, live_status, duration): if (live_status == 'is_live' and self.get_param('live_from_start') - or live_status == 'post_live' and (duration or 0) > 4 * 3600): + or live_status == 'post_live' and (duration or 0) > 2 * 3600): return live_status def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): @@ -4238,7 +4238,7 @@ def is_bad_format(fmt): for fmt in filter(is_bad_format, formats): fmt['preference'] = (fmt.get('preference') or -1) - 10 - fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 4 hours)', delim=' ') + fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 2 hours)', delim=' ') if needs_live_processing: self._prepare_live_from_start_formats( From 8a8af356e3bba98a7f7d333aff0777d5d92130c8 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 26 Jun 2023 16:13:31 +0530 Subject: [PATCH 006/218] [downloader/aria2c] Add `--no-conf` Closes #7404 --- yt_dlp/downloader/external.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index 007689a8c..f637a100b 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -271,7 +271,7 @@ def _call_downloader(self, tmpfilename, info_dict): return super()._call_downloader(tmpfilename, info_dict) def _make_cmd(self, tmpfilename, info_dict): - cmd = [self.exe, '-c', + cmd = [self.exe, '-c', '--no-conf', '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', '--http-accept-gzip=true', '--file-allocation=none', '-x16', '-j16', '-s16'] if 'fragments' in info_dict: From f393bbe724b1fc6c7f754a5da507e807b2b40ad2 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 26 Jun 2023 16:14:20 +0530 Subject: [PATCH 007/218] [extractor/sbs] Python 3.7 compat Closes #7410 --- yt_dlp/extractor/sbs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index ac0b6de20..119106e8e 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -139,8 +139,8 @@ def _real_extract(self, url): 'release_year': ('releaseYear', {int_or_none}), 'duration': ('duration', ({float_or_none}, {parse_duration})), 'is_live': ('liveStream', {bool}), - 'age_limit': ( - ('classificationID', 'contentRating'), {str.upper}, {self._AUS_TV_PARENTAL_GUIDELINES.get}), + 'age_limit': (('classificationID', 'contentRating'), {str.upper}, { + lambda x: self._AUS_TV_PARENTAL_GUIDELINES.get(x)}), # dict.get is unhashable in py3.7 }, get_all=False), **traverse_obj(media, { 'categories': (('genres', ...), ('taxonomy', ('genre', 'subgenre'), 'name'), {str}), From 91302ed349f34dc26cc1d661bb45a4b71f4417f7 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Mon, 26 Jun 2023 16:19:49 +0530 Subject: [PATCH 008/218] [utils] clean_podcast_url: Handle protocol in redirect URL Closes #7430 --- yt_dlp/utils/_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index de51f6208..f68cdb968 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5113,7 +5113,7 @@ def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', def clean_podcast_url(url): - return re.sub(r'''(?x) + url = re.sub(r'''(?x) (?: (?: chtbl\.com/track| @@ -5127,6 +5127,7 @@ def clean_podcast_url(url): st\.fm # https://podsights.com/docs/ )/e )/''', '', url) + return re.sub(r'^\w+://(\w+://)', r'\1', url) 
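The second substitution added to `clean_podcast_url` above handles trackers that embed the complete target URL after their prefix, which previously left a doubled scheme behind. A toy demonstration of just that rule:

```python
import re

url = 'https://https://example.com/episode.mp3'  # remainder after stripping e.g. 'pdst.fm/e/'
print(re.sub(r'^\w+://(\w+://)', r'\1', url))    # https://example.com/episode.mp3
```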
_HEX_TABLE = '0123456789abcdef' From 5b4b92769afcc398475e481bfa839f1158902fe9 Mon Sep 17 00:00:00 2001 From: Aman Salwan <121633121+AmanSal1@users.noreply.github.com> Date: Wed, 28 Jun 2023 01:58:23 +0530 Subject: [PATCH 009/218] [extractor/crunchyroll:music] Fix `_VALID_URL` (#7439) Closes #7419 Authored by: AmanSal1, rdamas Co-authored-by: Robert Damas --- yt_dlp/extractor/crunchyroll.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index d4a21616b..910504ed2 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -490,8 +490,21 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): _VALID_URL = r'''(?x) https?://(?:www\.)?crunchyroll\.com/ (?P(?:\w{2}(?:-\w{2})?/)?) - watch/(?Pconcert|musicvideo)/(?P\w{10})''' + watch/(?Pconcert|musicvideo)/(?P\w+)''' _TESTS = [{ + 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79', + 'info_dict': { + 'ext': 'mp4', + 'id': 'MV5B02C79', + 'display_id': 'egaono-hana', + 'title': 'Egaono Hana', + 'track': 'Egaono Hana', + 'artist': 'Goose house', + 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', + 'genre': ['J-Pop'], + }, + 'params': {'skip_download': 'm3u8'}, + }, { 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C', 'info_dict': { 'ext': 'mp4', @@ -519,11 +532,14 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): }, 'params': {'skip_download': 'm3u8'}, }, { - 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field', + 'url': 'https://www.crunchyroll.com/de/watch/musicvideo/MV5B02C79/egaono-hana', 'only_matching': True, }, { 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135/live-is-smile-always-364joker-at-yokohama-arena', 'only_matching': True, + }, { + 'url': 'https://www.crunchyroll.com/watch/musicvideo/MV88BB7F2C/crossing-field', + 'only_matching': True, }] _API_ENDPOINT = 'music' From 8f05fbae2a79ce0713077ccc68b354e63216bf20 Mon Sep 17 00:00:00 2001 From: Xiao Han <38774211+meliber@users.noreply.github.com> Date: Tue, 27 Jun 2023 16:16:57 -0500 Subject: [PATCH 010/218] [extractor/abc] Fix extraction (#7434) Closes #6433 Authored by: meliber --- yt_dlp/extractor/abc.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index 0ca76b85a..f56133eb3 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -12,6 +12,7 @@ int_or_none, parse_iso8601, str_or_none, + traverse_obj, try_get, unescapeHTML, update_url_query, @@ -85,6 +86,15 @@ class ABCIE(InfoExtractor): 'uploader': 'Behind the News', 'uploader_id': 'behindthenews', } + }, { + 'url': 'https://www.abc.net.au/news/2023-06-25/wagner-boss-orders-troops-back-to-bases-to-avoid-bloodshed/102520540', + 'info_dict': { + 'id': '102520540', + 'title': 'Wagner Group retreating from Russia, leader Prigozhin to move to Belarus', + 'ext': 'mp4', + 'description': 'Wagner troops leave Rostov-on-Don and\xa0Yevgeny Prigozhin will move to Belarus under a deal brokered by Belarusian President Alexander Lukashenko to end the mutiny.', + 'thumbnail': 'https://live-production.wcms.abc-cdn.net.au/0c170f5b57f0105c432f366c0e8e267b?impolicy=wcms_crop_resize&cropH=2813&cropW=5000&xPos=0&yPos=249&width=862&height=485', + } }] def _real_extract(self, url): @@ -107,7 +117,7 @@ def _real_extract(self, url): video = True if mobj is None: - mobj = re.search(r'(?P)"sources": (?P\[[^\]]+\]),', webpage) + mobj = 
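The broadened search on the next line also catches pages that store their media under `files` or `renditions`. A simplified standalone check against a fabricated page snippet (the extractor's named capture groups are omitted here for brevity):

```python
import re

webpage = '<script>cfg = {"renditions": [{"url": "https://example.com/v.mp4", "MIMEType": "video/mp4"}]};</script>'
m = re.search(r'"(?:sources|files|renditions)":\s*(\[[^\]]+\])', webpage)
print(m.group(1))  # [{"url": "https://example.com/v.mp4", "MIMEType": "video/mp4"}]
```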
re.search(r'(?P)"(?:sources|files|renditions)":\s*(?P\[[^\]]+\])', webpage) if mobj is None: mobj = re.search( r'inline(?PVideo|Audio|YouTube)Data\.push\((?P[^)]+)\);', @@ -121,7 +131,8 @@ def _real_extract(self, url): urls_info = self._parse_json( mobj.group('json_data'), video_id, transform_source=js_to_json) youtube = mobj.group('type') == 'YouTube' - video = mobj.group('type') == 'Video' or urls_info[0]['contentType'] == 'video/mp4' + video = mobj.group('type') == 'Video' or traverse_obj( + urls_info, (0, ('contentType', 'MIMEType')), get_all=False) == 'video/mp4' if not isinstance(urls_info, list): urls_info = [urls_info] From a2be9781fbf4d7e4db245c277ca2ecc41cf3a7b2 Mon Sep 17 00:00:00 2001 From: bashonly Date: Tue, 27 Jun 2023 16:50:02 -0500 Subject: [PATCH 011/218] [extractor/Douyin] Fix extraction from webpage Closes #7431 Authored by: bashonly --- yt_dlp/extractor/tiktok.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 9c6d74007..2f491c317 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -1015,18 +1015,16 @@ def _real_extract(self, url): self.to_screen(f'{e}; trying with webpage') webpage = self._download_webpage(url, video_id) - render_data_json = self._search_regex( - r'', - webpage, 'render data', default=None) - if not render_data_json: + render_data = self._search_json( + r'', webpage)] post = traverse_obj(post_data, ( - ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] + ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) @@ -493,14 +509,14 @@ def process_formats(info): def extract_relay_data(_filter): return self._parse_json(self._search_regex( - r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, + r'data-sjs>({.*?%s.*?})' % _filter, webpage, 'replay data', default='{}'), video_id, fatal=False) or {} def extract_relay_prefetched_data(_filter): - replay_data = extract_relay_data(_filter) - for require in (replay_data.get('require') or []): - if require[0] == 'RelayPrefetchedStreamCache': - return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + return traverse_obj(extract_relay_data(_filter), ( + 'require', (None, (..., ..., ..., '__bbox', 'require')), + lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ..., + '__bbox', 'result', 'data', {dict}), get_all=False) or {} if not video_data: server_js_data = self._parse_json(self._search_regex([ @@ -511,7 +527,7 @@ def extract_relay_prefetched_data(_filter): if not video_data: data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)') if data: entries = [] @@ -526,7 +542,8 @@ def parse_graphql_video(video): formats = [] q = qualities(['sd', 'hd']) for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), - ('playable_url_dash', '')): + ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), + ('browser_native_sd_url', 'sd')): playable_url = video.get(key) if not playable_url: continue From fe371dcf0ba5ce8d42480eade54eeeac99ab3cb0 Mon Sep 17 00:00:00 2001 From: ifan-t Date: Fri, 8 Sep 2023 13:25:43 +0100 Subject: [PATCH 125/218] 
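The Facebook patch above walks the new `data-sjs` payload with a predicate-based `traverse_obj` path rather than looping over `require` entries by hand. Heavily trimmed, and with a hypothetical payload shape, the idea is:

```python
from yt_dlp.utils import traverse_obj

replay_data = {'require': [
    ['RelayPrefetchedStreamCache', 'next', [], [None, {'__bbox': {'result': {'data': {'id': '123'}}}}]],
]}
print(traverse_obj(replay_data, (
    'require', lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ...,
    '__bbox', 'result', 'data', {dict}), get_all=False))  # {'id': '123'}
```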
[ie/S4C] Add series support and extract subs/thumbs (#7776) Authored by: ifan-t --- yt_dlp/extractor/_extractors.py | 5 ++- yt_dlp/extractor/s4c.py | 57 +++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index f11554bdd..b788737a2 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1710,7 +1710,10 @@ RuvIE, RuvSpilaIE ) -from .s4c import S4CIE +from .s4c import ( + S4CIE, + S4CSeriesIE +) from .safari import ( SafariIE, SafariApiIE, diff --git a/yt_dlp/extractor/s4c.py b/yt_dlp/extractor/s4c.py index 38a905896..990ea2b44 100644 --- a/yt_dlp/extractor/s4c.py +++ b/yt_dlp/extractor/s4c.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import traverse_obj +from ..utils import traverse_obj, url_or_none class S4CIE(InfoExtractor): @@ -11,7 +11,8 @@ class S4CIE(InfoExtractor): 'ext': 'mp4', 'title': 'Y Swn', 'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0', - 'duration': 5340 + 'duration': 5340, + 'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Y_Swn_2023S4C_099_ii.jpg' }, }, { 'url': 'https://www.s4c.cymru/clic/programme/856636948', @@ -21,6 +22,7 @@ class S4CIE(InfoExtractor): 'title': 'Am Dro', 'duration': 2880, 'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe', + 'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Am_Dro_2022-23S4C_P6_4005.jpg' }, }] @@ -30,7 +32,7 @@ def _real_extract(self, url): f'https://www.s4c.cymru/df/full_prog_details?lang=e&programme_id={video_id}', video_id, fatal=False) - filename = self._download_json( + player_config = self._download_json( 'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={ 'programme_id': video_id, 'signed': '0', @@ -38,7 +40,13 @@ def _real_extract(self, url): 'mode': 'od', 'appId': 'clic', 'streamName': '', - }, note='Downloading player config JSON')['filename'] + }, note='Downloading player config JSON') + subtitles = {} + for sub in traverse_obj(player_config, ('subtitles', lambda _, v: url_or_none(v['0']))): + subtitles.setdefault(sub.get('3', 'en'), []).append({ + 'url': sub['0'], + 'name': sub.get('1'), + }) m3u8_url = self._download_json( 'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={ 'mode': 'od', @@ -46,17 +54,52 @@ def _real_extract(self, url): 'region': 'WW', 'extra': 'false', 'thirdParty': 'false', - 'filename': filename, + 'filename': player_config['filename'], }, note='Downloading streaming urls JSON')['hls'] - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') return { 'id': video_id, - 'formats': formats, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls'), 'subtitles': subtitles, + 'thumbnail': url_or_none(player_config.get('poster')), **traverse_obj(details, ('full_prog_details', 0, { 'title': (('programme_title', 'series_title'), {str}), 'description': ('full_billing', {str.strip}), 'duration': ('duration', {lambda x: int(x) * 60}), }), get_all=False), } + + +class S4CSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/series/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.s4c.cymru/clic/series/864982911', + 'playlist_mincount': 6, + 'info_dict': { + 'id': '864982911', + 'title': 'Iaith ar Daith', + 'description': 'md5:e878ebf660dce89bd2ef521d7ce06397' + }, + }, { + 'url': 'https://www.s4c.cymru/clic/series/866852587', + 'playlist_mincount': 8, + 'info_dict': { + 'id': '866852587', + 'title': 'FFIT 
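The subtitle collection in the S4C patch above keys each entry by numeric field names from the player config. With a made-up `player_config` fragment (assuming '0' is the URL, '1' the label and '3' the language code, as the code implies):

```python
from yt_dlp.utils import traverse_obj, url_or_none

player_config = {'subtitles': [{'0': 'https://example.com/en.vtt', '1': 'English', '3': 'en'}]}
subtitles = {}
for sub in traverse_obj(player_config, ('subtitles', lambda _, v: url_or_none(v['0']))):
    subtitles.setdefault(sub.get('3', 'en'), []).append({'url': sub['0'], 'name': sub.get('1')})
print(subtitles)  # {'en': [{'url': 'https://example.com/en.vtt', 'name': 'English'}]}
```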
Cymru', + 'description': 'md5:abcb3c129cb68dbb6cd304fd33b07e96' + }, + }] + + def _real_extract(self, url): + series_id = self._match_id(url) + series_details = self._download_json( + 'https://www.s4c.cymru/df/series_details', series_id, query={ + 'lang': 'e', + 'series_id': series_id, + 'show_prog_in_series': 'Y' + }, note='Downloading series details JSON') + + return self.playlist_result( + [self.url_result(f'https://www.s4c.cymru/clic/programme/{episode_id}', S4CIE, episode_id) + for episode_id in traverse_obj(series_details, ('other_progs_in_series', ..., 'id'))], + series_id, traverse_obj(series_details, ('full_prog_details', 0, 'series_title', {str}))) From 5d0395498d7065aa5e55bac85fa9354b4b0d48eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Szaby=20Gr=C3=BCnwald?= Date: Fri, 8 Sep 2023 14:54:41 +0200 Subject: [PATCH 126/218] [ie/wdr] Fix extraction (#7979) Closes #7461 Authored by: szabyg --- yt_dlp/extractor/wdr.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/wdr.py b/yt_dlp/extractor/wdr.py index de5dc2666..6767f2654 100644 --- a/yt_dlp/extractor/wdr.py +++ b/yt_dlp/extractor/wdr.py @@ -173,6 +173,7 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE 'skip': 'HTTP Error 404: Not Found', }, { + # FIXME: Asset JSON is directly embedded in webpage 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { 'id': 'mdb-2296252', @@ -221,6 +222,8 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE 'id': 'mdb-869971', 'ext': 'mp4', 'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'alt_title': 'COSMO Livestream', + 'live_status': 'is_live', 'upload_date': '20160101', }, 'params': { @@ -248,6 +251,16 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE 'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html', 'only_matching': True, }, + { + 'url': 'https://www1.wdr.de/mediathek/video/sendungen/rockpalast/video-baroness---freak-valley-festival--100.html', + 'info_dict': { + 'id': 'mdb-2741028', + 'ext': 'mp4', + 'title': 'Baroness - Freak Valley Festival 2022', + 'alt_title': 'Rockpalast', + 'upload_date': '20220725', + }, + } ] def _real_extract(self, url): @@ -259,7 +272,7 @@ def _real_extract(self, url): # Article with several videos - # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdr.de the data-extension-ard is in a tag with the class "mediaLink" # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" # for wdrmaus, in a tag with the class "videoButton" (previously a link # to the page in a multiline "videoLink"-tag) @@ -268,7 +281,7 @@ def _real_extract(self, url): (?: (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+| (["\'])videoLink\b.*?\2[\s]*>\n[^\n]* - )data-extension=(["\'])(?P(?:(?!\3).)+)\3 + )data-extension(?:-ard)?=(["\'])(?P(?:(?!\3).)+)\3 ''', webpage): media_link_obj = self._parse_json( mobj.group('data'), display_id, transform_source=js_to_json, @@ -295,7 +308,7 @@ def _real_extract(self, url): compat_urlparse.urljoin(url, mobj.group('href')), ie=WDRPageIE.ie_key()) for mobj in re.finditer( - r']+\bhref=(["\'])(?P(?:(?!\1).)+)\1[^>]+\bdata-extension=', + r']+\bhref=(["\'])(?P(?:(?!\1).)+)\1[^>]+\bdata-extension(?:-ard)?=', webpage) if re.match(self._PAGE_REGEX, mobj.group('href')) ] From a006ce2b27357c15792eb5c18f06765e640b801c Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 
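The WDR fix above relaxes the attribute match to accept the site's new `data-extension-ard` spelling alongside the old `data-extension`. A standalone check with a fabricated tag:

```python
import re

tag = '<a href="/video.html" data-extension-ard=\'{"mediaObj": {"url": "https://example.com/m.js"}}\'>'
m = re.search(r'data-extension(?:-ard)?=(["\'])(?P<data>(?:(?!\1).)+)\1', tag)
print(m.group('data'))  # {"mediaObj": {"url": "https://example.com/m.js"}}
```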
9 Sep 2023 10:14:49 -0500 Subject: [PATCH 127/218] [ie/twitter] Fix retweet extraction and syndication API (#8016) Authored by: bashonly --- README.md | 2 +- yt_dlp/extractor/twitter.py | 181 ++++++++++++++++++++++++++---------- 2 files changed, 132 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index b82d92a6e..c7b73f4fd 100644 --- a/README.md +++ b/README.md @@ -1854,7 +1854,7 @@ #### rokfinchannel * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` #### twitter -* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed +* `api`: Select one of `graphql` (default), `legacy` or `syndication` as the API for tweet extraction. Has no effect if logged in #### stacommu, wrestleuniverse * `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index f86216f8f..4065acbaa 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1,9 +1,10 @@ -import functools import json +import random import re from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE +from ..compat import functools # isort: split from ..compat import ( compat_parse_qs, compat_urllib_parse_unquote, @@ -147,10 +148,14 @@ def _search_dimensions_in_video_url(a_format, video_url): def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) + @functools.cached_property + def _selected_api(self): + return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0] + def _fetch_guest_token(self, display_id): guest_token = traverse_obj(self._download_json( f'{self._API_BASE}guest/activate.json', display_id, 'Downloading guest token', data=b'', - headers=self._set_base_headers(legacy=display_id and self._configuration_arg('legacy_api'))), + headers=self._set_base_headers(legacy=display_id and self._selected_api == 'legacy')), ('guest_token', {str})) if not guest_token: raise ExtractorError('Could not retrieve guest token') @@ -295,7 +300,7 @@ def input_dict(subtask_id, text): self.report_login() def _call_api(self, path, video_id, query={}, graphql=False): - headers = self._set_base_headers(legacy=not graphql and self._configuration_arg('legacy_api')) + headers = self._set_base_headers(legacy=not graphql and self._selected_api == 'legacy') headers.update({ 'x-twitter-auth-type': 'OAuth2Session', 'x-twitter-client-language': 'en', @@ -707,6 +712,7 @@ class TwitterIE(TwitterBaseIE): 'tags': [], 'age_limit': 0, }, + 'skip': 'This Tweet is unavailable', }, { # not available in Periscope 'url': 'https://twitter.com/ViviEducation/status/1136534865145286656', @@ -721,6 +727,7 @@ class TwitterIE(TwitterBaseIE): 'view_count': int, }, 'add_ie': ['TwitterBroadcast'], + 'skip': 'Broadcast no longer exists', }, { # unified card 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', @@ -773,9 +780,9 @@ class TwitterIE(TwitterBaseIE): 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'info_dict': { 'id': '1577719286659006464', - 'title': 'Ultima📛 | #вʟм - Test', + 'title': 'Ultima📛| New Era - Test', 'description': 'Test https://t.co/Y3KEZD7Dad', - 'uploader': 'Ultima📛 | #вʟм', + 'uploader': 'Ultima📛| New Era', 'uploader_id': 'UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX', 'upload_date': '20221005', @@ 
-811,7 +818,7 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 0, }, }, { - # Adult content, fails if not logged in (GraphQL) + # Adult content, fails if not logged in 'url': 'https://twitter.com/Rizdraws/status/1575199173472927762', 'info_dict': { 'id': '1575199163847000068', @@ -831,9 +838,10 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 18, 'tags': [] }, + 'params': {'skip_download': 'The media could not be played'}, 'skip': 'Requires authentication', }, { - # Playlist result only with auth + # Playlist result only with graphql API 'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', 'playlist_mincount': 2, 'info_dict': { @@ -898,7 +906,7 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': 'MoniqueCamarra', 'live_status': 'was_live', 'release_timestamp': 1658417414, - 'description': 'md5:4dc8e972f1d8b3c6580376fabb02a3ad', + 'description': 'md5:acce559345fd49f129c20dbcda3f1201', 'timestamp': 1658407771, 'release_date': '20220721', 'upload_date': '20220721', @@ -1007,10 +1015,10 @@ class TwitterIE(TwitterBaseIE): 'view_count': int, 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, - 'uploader': 'Mün The Friend Of YWAP', + 'uploader': 'Mün', 'repost_count': int, 'upload_date': '20221206', - 'title': 'Mün The Friend Of YWAP - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', + 'title': 'Mün - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', 'comment_count': int, 'like_count': int, 'tags': [], @@ -1019,7 +1027,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1670306984.0, }, }, { - # url to retweet id w/ legacy api + # retweeted_status (private) 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', 'info_dict': { 'id': '1623274794488659969', @@ -1039,32 +1047,84 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'repost_count': int, }, - 'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}}, 'skip': 'Protected tweet', }, { - # orig tweet w/ graphql - 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', + # retweeted_status + 'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009', 'info_dict': { - 'id': '1623274794488659969', - 'display_id': '1623739803874349067', + 'id': '1694928337846538240', 'ext': 'mp4', - 'title': '@selfisekai@hackerspace.pl 🐀 - RT @Johnnybull3ts: Me after going viral to over 30million people: Whoopsie-daisy', - 'description': 'md5:9258bdbb54793bdc124fe1cd47e96c6a', - 'uploader': '@selfisekai@hackerspace.pl 🐀', - 'uploader_id': 'liberdalau', - 'uploader_url': 'https://twitter.com/liberdalau', + 'display_id': '1695424220702888009', + 'title': 'md5:e8daa9527bc2b947121395494f786d9d', + 'description': 'md5:004f2d37fd58737724ec75bc7e679938', + 'uploader': 'Benny Johnson', + 'uploader_id': 'bennyjohnson', + 'uploader_url': 'https://twitter.com/bennyjohnson', 'age_limit': 0, 'tags': [], - 'duration': 8.033, - 'timestamp': 1675964711.0, - 'upload_date': '20230209', - 'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+', + 'duration': 45.001, + 'timestamp': 1692962814.0, + 'upload_date': '20230825', + 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'like_count': int, - 'view_count': int, 'repost_count': int, + 'view_count': int, 'comment_count': int, }, - 'skip': 'Protected tweet', + }, { + # retweeted_status w/ legacy API + 'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009', + 'info_dict': { + 'id': 
'1694928337846538240', + 'ext': 'mp4', + 'display_id': '1695424220702888009', + 'title': 'md5:e8daa9527bc2b947121395494f786d9d', + 'description': 'md5:004f2d37fd58737724ec75bc7e679938', + 'uploader': 'Benny Johnson', + 'uploader_id': 'bennyjohnson', + 'uploader_url': 'https://twitter.com/bennyjohnson', + 'age_limit': 0, + 'tags': [], + 'duration': 45.001, + 'timestamp': 1692962814.0, + 'upload_date': '20230825', + 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', + 'like_count': int, + 'repost_count': int, + }, + 'params': {'extractor_args': {'twitter': {'api': ['legacy']}}}, + }, { + # Broadcast embedded in tweet + 'url': 'https://twitter.com/JessicaDobsonWX/status/1693057346933600402', + 'info_dict': { + 'id': '1yNGaNLjEblJj', + 'ext': 'mp4', + 'title': 'Jessica Dobson - WAVE Weather Now - Saturday 8/19/23 Update', + 'uploader': 'Jessica Dobson', + 'uploader_id': '1DZEoDwDovRQa', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, + }, + 'add_ie': ['TwitterBroadcast'], + }, { + # Animated gif and quote tweet video, with syndication API + 'url': 'https://twitter.com/BAKKOOONN/status/1696256659889565950', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '1696256659889565950', + 'title': 'BAKOON - https://t.co/zom968d0a0', + 'description': 'https://t.co/zom968d0a0', + 'tags': [], + 'uploader': 'BAKOON', + 'uploader_id': 'BAKKOOONN', + 'uploader_url': 'https://twitter.com/BAKKOOONN', + 'age_limit': 18, + 'timestamp': 1693254077.0, + 'upload_date': '20230828', + 'like_count': int, + }, + 'params': {'extractor_args': {'twitter': {'api': ['syndication']}}}, + 'expected_warnings': ['Not all metadata'], }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1103,6 +1163,14 @@ class TwitterIE(TwitterBaseIE): 'only_matching': True, }] + _MEDIA_ID_RE = re.compile(r'_video/(\d+)/') + + @property + def _GRAPHQL_ENDPOINT(self): + if self.is_logged_in: + return 'zZXycP0V6H7m-2r0mOnFcA/TweetDetail' + return '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId' + def _graphql_to_legacy(self, data, twid): result = traverse_obj(data, ( 'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries', @@ -1130,9 +1198,14 @@ def _graphql_to_legacy(self, data, twid): 'user': ('core', 'user_results', 'result', 'legacy'), 'card': ('card', 'legacy'), 'quoted_status': ('quoted_status_result', 'result', 'legacy'), + 'retweeted_status': ('legacy', 'retweeted_status_result', 'result', 'legacy'), }, expected_type=dict, default={})) - # extra transformation is needed since result does not match legacy format + # extra transformations needed since result does not match legacy format + if status.get('retweeted_status'): + status['retweeted_status']['user'] = traverse_obj(status, ( + 'retweeted_status_result', 'result', 'core', 'user_results', 'result', 'legacy', {dict})) or {} + binding_values = { binding_value.get('key'): binding_value.get('value') for binding_value in traverse_obj(status, ('card', 'binding_values', ..., {dict})) @@ -1208,33 +1281,42 @@ def _build_graphql_query(self, media_id): } def _extract_status(self, twid): - if self.is_logged_in: - return self._graphql_to_legacy( - self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) + if self.is_logged_in or self._selected_api == 'graphql': + status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid) - try: - if not self._configuration_arg('legacy_api'): - return self._graphql_to_legacy( - 
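`_MEDIA_ID_RE`, defined above and used just below, is applied inside a `traverse_obj` path: a `{callable}` step turns the variant URL into a `re.Match`, which `traverse_obj` can then index by group number. Illustration with a made-up variant URL:

```python
import re
from yt_dlp.utils import traverse_obj

_MEDIA_ID_RE = re.compile(r'_video/(\d+)/')
detail = {'video_info': {'variants': [
    {'url': 'https://video.twimg.com/ext_tw_video/1694928337846538240/pu/pl/index.m3u8'}]}}
print(traverse_obj(detail, (
    'video_info', 'variants', ..., 'url',
    {_MEDIA_ID_RE.search}, 1), get_all=False))  # '1694928337846538240'
```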
self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid) - return traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { + elif self._selected_api == 'legacy': + status = self._call_api(f'statuses/show/{twid}.json', twid, { 'cards_platform': 'Web-12', 'include_cards': 1, 'include_reply_count': 1, 'include_user_entities': 0, 'tweet_mode': 'extended', - }), 'retweeted_status', None) + }) - except ExtractorError as e: - if e.expected: - raise + elif self._selected_api == 'syndication': self.report_warning( - f'{e.orig_msg}. Falling back to syndication endpoint; some metadata may be missing', twid) + 'Not all metadata or media is available via syndication endpoint', twid, only_once=True) + status = self._download_json( + 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', + headers={'User-Agent': 'Googlebot'}, query={ + 'id': twid, + # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '') + 'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)), + }) + if not status: + raise ExtractorError('Syndication endpoint returned empty JSON response') + # Transform the result so its structure matches that of legacy/graphql + media = [] + for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})): + detail['id_str'] = traverse_obj(detail, ( + 'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid + media.append(detail) + status['extended_entities'] = {'media': media} - status = self._download_json( - 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', - headers={'User-Agent': 'Googlebot'}, query={'id': twid}) - status['extended_entities'] = {'media': status.get('mediaDetails')} - return status + else: + raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True) + + return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {} def _real_extract(self, url): twid, selected_index = self._match_valid_url(url).group('id', 'index') @@ -1266,10 +1348,7 @@ def _real_extract(self, url): } def extract_from_video_info(media): - media_id = traverse_obj(media, 'id_str', 'id', ( - 'video_info', 'variants', ..., 'url', - {functools.partial(re.search, r'_video/(\d+)/')}, 1 - ), get_all=False, expected_type=str_or_none) or twid + media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) self.write_debug(f'Extracting from video info: {media_id}') formats = [] @@ -1503,6 +1582,8 @@ def _real_extract(self, url): broadcast = self._call_api( 'broadcasts/show.json', broadcast_id, {'ids': broadcast_id})['broadcasts'][broadcast_id] + if not broadcast: + raise ExtractorError('Broadcast no longer exists', expected=True) info = self._parse_broadcast_data(broadcast, broadcast_id) media_key = broadcast['media_key'] source = self._call_api( From 66cc64ff6696f9921ff112a278542f8d999ffea4 Mon Sep 17 00:00:00 2001 From: bashonly Date: Mon, 11 Sep 2023 09:51:39 -0500 Subject: [PATCH 128/218] [ie/zoom] Extract duration Closes #8080 Authored by: bashonly --- yt_dlp/extractor/zoom.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yt_dlp/extractor/zoom.py b/yt_dlp/extractor/zoom.py index 3d7ccca76..1e41d0434 100644 --- a/yt_dlp/extractor/zoom.py +++ b/yt_dlp/extractor/zoom.py @@ -127,6 +127,7 @@ def _real_extract(self, url): return { 'id': video_id, 'title': str_or_none(traverse_obj(data, ('meet', 'topic'))), + 'duration': 
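The syndication branch above sends a random token and leaves the real JavaScript derivation as a TODO. A rough Python port of that formula, as a sketch only (the fractional-digit count is an assumption, though the trailing `replace` strips most of the difference; untested against the live endpoint):

```python
import math

def syndication_token(twid):
    # ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '')
    value = int(twid) / 1e15 * math.pi
    digits = '0123456789abcdefghijklmnopqrstuvwxyz'
    ipart, out = int(value), ''
    while ipart:
        ipart, rem = divmod(ipart, 36)
        out = digits[rem] + out
    out = (out or '0') + '.'
    frac = value % 1
    for _ in range(12):  # JS emits a limited number of fractional base-36 digits
        frac *= 36
        out += digits[int(frac)]
        frac %= 1
    return out.replace('0', '').replace('.', '')
```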
int_or_none(data.get('duration')), 'subtitles': subtitles, 'formats': formats, 'http_headers': { From 7b71643cc986de9a3768dac4ac9b64f4d05e7f5e Mon Sep 17 00:00:00 2001 From: garret Date: Fri, 15 Sep 2023 18:18:51 +0100 Subject: [PATCH 129/218] [ie/mixcloud] Update API URL (#8114) Closes #8104 Authored by: garret1317 --- yt_dlp/extractor/mixcloud.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/mixcloud.py b/yt_dlp/extractor/mixcloud.py index fb5a08ca2..8a95d1a5d 100644 --- a/yt_dlp/extractor/mixcloud.py +++ b/yt_dlp/extractor/mixcloud.py @@ -20,7 +20,7 @@ class MixcloudBaseIE(InfoExtractor): def _call_api(self, object_type, object_fields, display_id, username, slug=None): lookup_key = object_type + 'Lookup' return self._download_json( - 'https://www.mixcloud.com/graphql', display_id, query={ + 'https://app.mixcloud.com/graphql', display_id, query={ 'query': '''{ %s(lookup: {username: "%s"%s}) { %s @@ -46,7 +46,15 @@ class MixcloudIE(MixcloudBaseIE): 'view_count': int, 'timestamp': 1321359578, 'upload_date': '20111115', + 'uploader_url': 'https://www.mixcloud.com/dholbach/', + 'artist': 'Submorphics & Chino , Telekinesis, Porter Robinson, Enei, Breakage ft Jess Mills', + 'duration': 3723, + 'tags': [], + 'comment_count': int, + 'repost_count': int, + 'like_count': int, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/', 'info_dict': { @@ -60,7 +68,14 @@ class MixcloudIE(MixcloudBaseIE): 'view_count': int, 'timestamp': 1422987057, 'upload_date': '20150203', + 'uploader_url': 'https://www.mixcloud.com/gillespeterson/', + 'duration': 2992, + 'tags': [], + 'comment_count': int, + 'repost_count': int, + 'like_count': int, }, + 'params': {'skip_download': '404 playback error on site'}, }, { 'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/', 'only_matching': True, @@ -259,9 +274,9 @@ def _real_extract(self, url): cloudcast_url = cloudcast.get('url') if not cloudcast_url: continue - slug = try_get(cloudcast, lambda x: x['slug'], compat_str) + item_slug = try_get(cloudcast, lambda x: x['slug'], compat_str) owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) - video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None + video_id = f'{owner_username}_{item_slug}' if item_slug and owner_username else None entries.append(self.url_result( cloudcast_url, MixcloudIE.ie_key(), video_id)) @@ -284,7 +299,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', + 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, }, { @@ -292,7 +307,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', + 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, 'playlist_mincount': 36, }, { @@ -300,7 +315,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_favorites', 'title': 'Daniel Holbach (favorites)', - 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789', + 'description': 'md5:a3f468a60ac8c3e1f8616380fc469b2b', }, # 'params': { # 'playlist_items': '1-100', @@ -323,9 +338,9 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'FirstEar_stream', 'title': 
'First Ear (stream)', - 'description': 'Curators of good music\r\n\r\nfirstearmusic.com', + 'description': 'we maraud for ears', }, - 'playlist_mincount': 271, + 'playlist_mincount': 269, }] _TITLE_KEY = 'displayName' From 497bbbbd7328cb705f70eced94dbd90993819a46 Mon Sep 17 00:00:00 2001 From: SevenLives <410355694@qq.com> Date: Sat, 16 Sep 2023 17:37:04 +0800 Subject: [PATCH 130/218] [ie/abematv] Fix proxy handling (#8046) Fixes https://github.com/yt-dlp/yt-dlp/issues/8036 Authored by: SevenLives --- yt_dlp/extractor/abematv.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 163b83c6d..2a093580c 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -12,7 +12,7 @@ import urllib.request import urllib.response import uuid - +from ..utils.networking import clean_proxies from .common import InfoExtractor from ..aes import aes_ecb_decrypt from ..utils import ( @@ -35,7 +35,10 @@ def add_opener(ydl, handler): # FIXME: Create proper API in .networking rh = ydl._request_director.handlers['Urllib'] if 'abematv-license' in rh._SUPPORTED_URL_SCHEMES: return - opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=ydl.proxies) + headers = ydl.params['http_headers'].copy() + proxies = ydl.proxies.copy() + clean_proxies(proxies, headers) + opener = rh._get_instance(cookiejar=ydl.cookiejar, proxies=proxies) assert isinstance(opener, urllib.request.OpenerDirector) opener.add_handler(handler) rh._SUPPORTED_URL_SCHEMES = (*rh._SUPPORTED_URL_SCHEMES, 'abematv-license') From 578a82e497502b951036ce9da6fe0dac6937ac27 Mon Sep 17 00:00:00 2001 From: Kshitiz Gupta Date: Sat, 16 Sep 2023 15:13:05 +0530 Subject: [PATCH 131/218] [ie/banbye] Support video ids containing a hyphen (#8059) Fixes https://github.com/yt-dlp/yt-dlp/issues/7895 Authored by: kshitiz305 --- yt_dlp/extractor/banbye.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/banbye.py b/yt_dlp/extractor/banbye.py index c87342565..e0fc93b97 100644 --- a/yt_dlp/extractor/banbye.py +++ b/yt_dlp/extractor/banbye.py @@ -31,7 +31,7 @@ def _extract_playlist(self, playlist_id): class BanByeIE(BanByeBaseIE): - _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?banbye.com/(?:en/)?watch/(?P[\w-]+)' _TESTS = [{ 'url': 'https://banbye.com/watch/v_ytfmvkVYLE8T', 'md5': '2f4ea15c5ca259a73d909b2cfd558eb5', @@ -59,7 +59,27 @@ class BanByeIE(BanByeBaseIE): 'title': 'Krzysztof Karoń', 'id': 'p_Ld82N6gBw_OJ', }, - 'playlist_count': 9, + 'playlist_mincount': 9, + }, { + 'url': 'https://banbye.com/watch/v_kb6_o1Kyq-CD', + 'info_dict': { + 'id': 'v_kb6_o1Kyq-CD', + 'ext': 'mp4', + 'title': 'Co tak naprawdę dzieje się we Francji?! 
Czy Warszawa a potem cała Polska będzie drugim Paryżem?!🤔🇵🇱', + 'description': 'md5:82be4c0e13eae8ea1ca8b9f2e07226a8', + 'uploader': 'Marcin Rola - MOIM ZDANIEM!🇵🇱', + 'channel_id': 'ch_QgWnHvDG2fo5', + 'channel_url': 'https://banbye.com/channel/ch_QgWnHvDG2fo5', + 'duration': 597, + 'timestamp': 1688642656, + 'upload_date': '20230706', + 'thumbnail': 'https://cdn.banbye.com/video/v_kb6_o1Kyq-CD/96.webp', + 'tags': ['Paryż', 'Francja', 'Polska', 'Imigranci', 'Morawiecki', 'Tusk'], + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'comment_count': int, + }, }] def _real_extract(self, url): From aee6b9b88c0bcccf27fd23b7e00fc0b7b168928f Mon Sep 17 00:00:00 2001 From: barsnick Date: Sat, 16 Sep 2023 12:04:08 +0200 Subject: [PATCH 132/218] [ie/Axs] Add extractor (#8094) Authored by: barsnick --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/axs.py | 87 +++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 yt_dlp/extractor/axs.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b788737a2..b836fe8a3 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -165,6 +165,7 @@ AWAANLiveIE, AWAANSeasonIE, ) +from .axs import AxsIE from .azmedien import AZMedienIE from .baidu import BaiduVideoIE from .banbye import ( diff --git a/yt_dlp/extractor/axs.py b/yt_dlp/extractor/axs.py new file mode 100644 index 000000000..4b263725f --- /dev/null +++ b/yt_dlp/extractor/axs.py @@ -0,0 +1,87 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + js_to_json, + parse_iso8601, + traverse_obj, + url_or_none, +) + + +class AxsIE(InfoExtractor): + IE_NAME = 'axs.tv' + _VALID_URL = r'https?://(?:www\.)?axs\.tv/(?:channel/(?:[^/?#]+/)+)?video/(?P[^/?#]+)' + + _TESTS = [{ + 'url': 'https://www.axs.tv/video/5f4dc776b70e4f1c194f22ef/', + 'md5': '8d97736ae8e50c64df528e5e676778cf', + 'info_dict': { + 'id': '5f4dc776b70e4f1c194f22ef', + 'title': 'Small Town', + 'ext': 'mp4', + 'description': 'md5:e314d28bfaa227a4d7ec965fae19997f', + 'upload_date': '20230602', + 'timestamp': 1685729564, + 'duration': 1284.216, + 'series': 'Rock & Roll Road Trip with Sammy Hagar', + 'season': 2, + 'episode': '3', + 'thumbnail': 'https://images.dotstudiopro.com/5f4e9d330a0c3b295a7e8394', + }, + }, { + 'url': 'https://www.axs.tv/channel/rock-star-interview/video/daryl-hall', + 'md5': '300ae795cd8f9984652c0949734ffbdc', + 'info_dict': { + 'id': '5f488148b70e4f392572977c', + 'display_id': 'daryl-hall', + 'title': 'Daryl Hall', + 'ext': 'mp4', + 'description': 'md5:e54ecaa0f4b5683fc9259e9e4b196628', + 'upload_date': '20230214', + 'timestamp': 1676403615, + 'duration': 2570.668, + 'series': 'The Big Interview with Dan Rather', + 'season': 3, + 'episode': '5', + 'thumbnail': 'https://images.dotstudiopro.com/5f4d1901f340b50d937cec32', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + webpage_json_data = self._search_json( + r'mountObj\s*=', webpage, 'video ID data', display_id, + transform_source=js_to_json) + video_id = webpage_json_data['video_id'] + company_id = webpage_json_data['company_id'] + + meta = self._download_json( + f'https://api.myspotlight.tv/dotplayer/video/{company_id}/{video_id}', + video_id, query={'device_type': 'desktop_web'})['video'] + + formats = self._extract_m3u8_formats( + meta['video_m3u8'], video_id, 'mp4', m3u8_id='hls') + + subtitles = {} + for cc in traverse_obj(meta, ('closeCaption', 
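The AXS extractor above parses `mountObj` with `transform_source=js_to_json` because the embedded object is JavaScript rather than strict JSON (unquoted keys, single quotes, trailing commas). What that transform buys, on a fabricated snippet:

```python
import json
from yt_dlp.utils import js_to_json

js = "{video_id: '5f4dc776b70e4f1c194f22ef', company_id: 'abc', }"
print(json.loads(js_to_json(js)))  # {'video_id': '5f4dc776b70e4f1c194f22ef', 'company_id': 'abc'}
```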
lambda _, v: url_or_none(v['srtPath']))): + subtitles.setdefault(cc.get('srtShortLang') or 'en', []).append( + {'ext': cc.get('srtExt'), 'url': cc['srtPath']}) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + **traverse_obj(meta, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'series': ('seriestitle', {str}), + 'season': ('season', {int}), + 'episode': ('episode', {str}), + 'duration': ('duration', {float_or_none}), + 'timestamp': ('updated_at', {parse_iso8601}), + 'thumbnail': ('thumb', {url_or_none}), + }), + 'subtitles': subtitles, + } From 6e07e4bc7e59f5bdb60e93c011e57b18b009f2b5 Mon Sep 17 00:00:00 2001 From: zhallgato Date: Sat, 16 Sep 2023 12:12:18 +0200 Subject: [PATCH 133/218] [ie/mediaklikk] Fix extractor (#8086) Fixes https://github.com/yt-dlp/yt-dlp/issues/8053 Authored by: bashonly, zhallgato --- yt_dlp/extractor/mediaklikk.py | 72 ++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py index 46365081b..fcc4827b5 100644 --- a/yt_dlp/extractor/mediaklikk.py +++ b/yt_dlp/extractor/mediaklikk.py @@ -1,5 +1,8 @@ from ..utils import ( - unified_strdate + ExtractorError, + traverse_obj, + unified_strdate, + url_or_none, ) from .common import InfoExtractor from ..compat import ( @@ -15,7 +18,7 @@ class MediaKlikkIE(InfoExtractor): (?P[^/#?_]+)''' _TESTS = [{ - # mediaklikk. date in html. + # (old) mediaklikk. date in html. 'url': 'https://mediaklikk.hu/video/hazajaro-delnyugat-bacska-a-duna-menten-palankatol-doroszloig/', 'info_dict': { 'id': '4754129', @@ -23,9 +26,21 @@ class MediaKlikkIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20210901', 'thumbnail': 'http://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg' + }, + 'skip': 'Webpage redirects to 404 page', + }, { + # mediaklikk. date in html. 
+ 'url': 'https://mediaklikk.hu/video/hazajaro-fabova-hegyseg-kishont-koronaja/', + 'info_dict': { + 'id': '6696133', + 'title': 'Hazajáró, Fabova-hegység - Kishont koronája', + 'display_id': 'hazajaro-fabova-hegyseg-kishont-koronaja', + 'ext': 'mp4', + 'upload_date': '20230903', + 'thumbnail': 'https://mediaklikk.hu/wp-content/uploads/sites/4/2014/02/hazajarouj_JO.jpg' } }, { - # m4sport + # (old) m4sport 'url': 'https://m4sport.hu/video/2021/08/30/gyemant-liga-parizs/', 'info_dict': { 'id': '4754999', @@ -33,6 +48,18 @@ class MediaKlikkIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20210830', 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/08/vlcsnap-2021-08-30-18h21m20s10-1024x576.jpg' + }, + 'skip': 'Webpage redirects to 404 page', + }, { + # m4sport + 'url': 'https://m4sport.hu/sportkozvetitesek/video/2023/09/08/atletika-gyemant-liga-brusszel/', + 'info_dict': { + 'id': '6711136', + 'title': 'Atlétika – Gyémánt Liga, Brüsszel', + 'display_id': 'atletika-gyemant-liga-brusszel', + 'ext': 'mp4', + 'upload_date': '20230908', + 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-08-22h43m18s691.jpg' } }, { # m4sport with *video/ url and no date @@ -40,20 +67,33 @@ class MediaKlikkIE(InfoExtractor): 'info_dict': { 'id': '4492099', 'title': 'Real Madrid - Chelsea 1-1', + 'display_id': 'real-madrid-chelsea-1-1', 'ext': 'mp4', - 'thumbnail': 'http://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png' + 'thumbnail': 'https://m4sport.hu/wp-content/uploads/sites/4/2021/04/Sequence-01.Still001-1024x576.png' } }, { - # hirado + # (old) hirado 'url': 'https://hirado.hu/videok/felteteleket-szabott-a-fovaros/', 'info_dict': { 'id': '4760120', 'title': 'Feltételeket szabott a főváros', 'ext': 'mp4', 'thumbnail': 'http://hirado.hu/wp-content/uploads/sites/4/2021/09/vlcsnap-2021-09-01-20h20m37s165.jpg' + }, + 'skip': 'Webpage redirects to video list page', + }, { + # hirado + 'url': 'https://hirado.hu/belfold/video/2023/09/11/marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal', + 'info_dict': { + 'id': '6716068', + 'title': 'Marad az éves elszámolás a napelemekre beruházó családoknál', + 'display_id': 'marad-az-eves-elszamolas-a-napelemekre-beruhazo-csaladoknal', + 'ext': 'mp4', + 'upload_date': '20230911', + 'thumbnail': 'https://hirado.hu/wp-content/uploads/sites/4/2023/09/vlcsnap-2023-09-11-09h16m09s882.jpg' } }, { - # petofilive + # (old) petofilive 'url': 'https://petofilive.hu/video/2021/06/07/tha-shudras-az-akusztikban/', 'info_dict': { 'id': '4571948', @@ -61,6 +101,18 @@ class MediaKlikkIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20210607', 'thumbnail': 'http://petofilive.hu/wp-content/uploads/sites/4/2021/06/vlcsnap-2021-06-07-22h14m23s915-1024x576.jpg' + }, + 'skip': 'Webpage redirects to empty page', + }, { + # petofilive + 'url': 'https://petofilive.hu/video/2023/09/09/futball-fesztival-a-margitszigeten/', + 'info_dict': { + 'id': '6713233', + 'title': 'Futball Fesztivál a Margitszigeten', + 'display_id': 'futball-fesztival-a-margitszigeten', + 'ext': 'mp4', + 'upload_date': '20230909', + 'thumbnail': 'https://petofilive.hu/wp-content/uploads/sites/4/2023/09/Clipboard11-2.jpg' } }] @@ -84,8 +136,12 @@ def _real_extract(self, url): player_data['video'] = player_data.pop('token') player_page = self._download_webpage('https://player.mediaklikk.hu/playernew/player.php', video_id, query=player_data) - playlist_url = self._proto_relative_url(compat_urllib_parse_unquote( - 
self._html_search_regex(r'\"file\":\s*\"(\\?/\\?/.*playlist\.m3u8)\"', player_page, 'playlist_url')).replace('\\/', '/')) + player_json = self._search_json( + r'\bpl\.setup\s*\(', player_page, 'player json', video_id, end_pattern=r'\);') + playlist_url = traverse_obj( + player_json, ('playlist', lambda _, v: v['type'] == 'hls', 'file', {url_or_none}), get_all=False) + if not playlist_url: + raise ExtractorError('Unable to extract playlist url') formats = self._extract_wowza_formats( playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash']) From 98eac0e6ba0e510ae7dfdfd249d42ee71fb272b1 Mon Sep 17 00:00:00 2001 From: hatsomatt <143712404+hatsomatt@users.noreply.github.com> Date: Sat, 16 Sep 2023 16:02:37 +0200 Subject: [PATCH 134/218] [ie/videa] Fix extraction (#8003) Closes #7427 Authored by: hatsomatt, aky-01 Co-authored-by: aky-01 <65510015+aky-01@users.noreply.github.com> --- yt_dlp/extractor/videa.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/videa.py b/yt_dlp/extractor/videa.py index 59ae933b0..634d2edea 100644 --- a/yt_dlp/extractor/videa.py +++ b/yt_dlp/extractor/videa.py @@ -38,6 +38,7 @@ class VideaIE(InfoExtractor): 'title': 'Az őrült kígyász 285 kígyót enged szabadon', 'thumbnail': r're:^https?://.*', 'duration': 21, + 'age_limit': 0, }, }, { 'url': 'http://videa.hu/videok/origo/jarmuvek/supercars-elozes-jAHDWfWSJH5XuFhH', @@ -48,6 +49,7 @@ class VideaIE(InfoExtractor): 'title': 'Supercars előzés', 'thumbnail': r're:^https?://.*', 'duration': 64, + 'age_limit': 0, }, }, { 'url': 'http://videa.hu/player?v=8YfIAjxwWGwT8HVQ', @@ -58,6 +60,7 @@ class VideaIE(InfoExtractor): 'title': 'Az őrült kígyász 285 kígyót enged szabadon', 'thumbnail': r're:^https?://.*', 'duration': 21, + 'age_limit': 0, }, }, { 'url': 'http://videa.hu/player/v/8YfIAjxwWGwT8HVQ?autoplay=1', @@ -124,7 +127,7 @@ def _real_extract(self, url): query['_t'] = result[:16] b64_info, handle = self._download_webpage_handle( - 'http://videa.hu/videaplayer_get_xml.php', video_id, query=query) + 'http://videa.hu/player/xml', video_id, query=query) if b64_info.startswith(' Date: Sat, 16 Sep 2023 16:24:11 +0200 Subject: [PATCH 135/218] [ie/TV5MondePlus] Fix extractor (#7952) Closes #4978 Authored by: korli, dirkf --- yt_dlp/extractor/tv5mondeplus.py | 98 ++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/yt_dlp/extractor/tv5mondeplus.py b/yt_dlp/extractor/tv5mondeplus.py index bd0be784d..4da1b26d1 100644 --- a/yt_dlp/extractor/tv5mondeplus.py +++ b/yt_dlp/extractor/tv5mondeplus.py @@ -1,10 +1,14 @@ +import urllib.parse + from .common import InfoExtractor from ..utils import ( determine_ext, extract_attributes, int_or_none, parse_duration, + traverse_obj, try_get, + url_or_none, ) @@ -12,6 +16,36 @@ class TV5MondePlusIE(InfoExtractor): IE_DESC = 'TV5MONDE+' _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P[^/?#]+)' _TESTS = [{ + # movie + 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/les-novices', + 'md5': 'c86f60bf8b75436455b1b205f9745955', + 'info_dict': { + 'id': 'ZX0ipMyFQq_6D4BA7b', + 'display_id': 'les-novices', + 'ext': 'mp4', + 'title': 'Les novices', + 'description': 'md5:2e7c33ba3ad48dabfcc2a956b88bde2b', + 'upload_date': '20230821', + 'thumbnail': 'https://revoir.tv5monde.com/uploads/media/video_thumbnail/0738/60/01e952b7ccf36b7c6007ec9131588954ab651de9.jpeg', + 'duration': 5177, + 'episode': 'Les novices', + }, + }, { + # series episode + 'url': 
'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/opj-les-dents-de-la-terre-2', + 'info_dict': { + 'id': 'wJ0eeEPozr_6D4BA7b', + 'display_id': 'opj-les-dents-de-la-terre-2', + 'ext': 'mp4', + 'title': "OPJ - Les dents de la Terre (2)", + 'description': 'md5:288f87fd68d993f814e66e60e5302d9d', + 'upload_date': '20230823', + 'series': 'OPJ', + 'episode': 'Les dents de la Terre (2)', + 'duration': 2877, + 'thumbnail': 'https://dl-revoir.tv5monde.com/images/1a/5753448.jpg' + }, + }, { # movie 'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/ceux-qui-travaillent', 'md5': '32fa0cde16a4480d1251502a66856d5f', @@ -23,6 +57,7 @@ class TV5MondePlusIE(InfoExtractor): 'description': 'md5:570e8bb688036ace873b2d50d24c026d', 'upload_date': '20210819', }, + 'skip': 'no longer available', }, { # series episode 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/vestiaires-caro-actrice', @@ -39,6 +74,7 @@ class TV5MondePlusIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'no longer available', }, { 'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver', 'only_matching': True, @@ -63,20 +99,45 @@ def _real_extract(self, url): video_files = self._parse_json( vpl_data['data-broadcast'], display_id) formats = [] - for video_file in video_files: - v_url = video_file.get('url') - if not v_url: - continue - video_format = video_file.get('format') or determine_ext(v_url) - if video_format == 'm3u8': - formats.extend(self._extract_m3u8_formats( - v_url, display_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': v_url, - 'format_id': video_format, - }) + video_id = None + + def process_video_files(v): + nonlocal video_id + for video_file in v: + v_url = video_file.get('url') + if not v_url: + continue + if video_file.get('type') == 'application/deferred': + d_param = urllib.parse.quote(v_url) + token = video_file.get('token') + if not token: + continue + deferred_json = self._download_json( + f'https://api.tv5monde.com/player/asset/{d_param}/resolve?condenseKS=true', display_id, + note='Downloading deferred info', headers={'Authorization': f'Bearer {token}'}, fatal=False) + v_url = traverse_obj(deferred_json, (0, 'url', {url_or_none})) + if not v_url: + continue + # data-guid from the webpage isn't stable, use the material id from the json urls + video_id = self._search_regex( + r'materials/([\da-zA-Z]{10}_[\da-fA-F]{7})/', v_url, 'video id', default=None) + process_video_files(deferred_json) + + video_format = video_file.get('format') or determine_ext(v_url) + if video_format == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif video_format == 'mpd': + formats.extend(self._extract_mpd_formats( + v_url, display_id, fatal=False)) + else: + formats.append({ + 'url': v_url, + 'format_id': video_format, + }) + + process_video_files(video_files) metadata = self._parse_json( vpl_data['data-metadata'], display_id) @@ -100,10 +161,11 @@ def _real_extract(self, url): if upload_date: upload_date = upload_date.replace('_', '') - video_id = self._search_regex( - (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', - r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id', - default=display_id) + if not video_id: + video_id = self._search_regex( + (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})', + r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video 
id', + default=display_id) return { 'id': video_id, From f659e6439444ac64305b5c80688cd82f59d2279c Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sat, 16 Sep 2023 17:50:06 +0200 Subject: [PATCH 136/218] [ie/bpb] Overhaul extractor (#8119) Authored by: Grub4K --- yt_dlp/extractor/bpb.py | 174 +++++++++++++++++++++++++++++++++------- yt_dlp/utils/_utils.py | 1 + 2 files changed, 145 insertions(+), 30 deletions(-) diff --git a/yt_dlp/extractor/bpb.py b/yt_dlp/extractor/bpb.py index f28e581b8..7fe089944 100644 --- a/yt_dlp/extractor/bpb.py +++ b/yt_dlp/extractor/bpb.py @@ -1,56 +1,170 @@ +import functools import re from .common import InfoExtractor from ..utils import ( + clean_html, + extract_attributes, + get_element_text_and_html_by_tag, + get_elements_by_class, + join_nonempty, js_to_json, - determine_ext, + mimetype2ext, + unified_strdate, + url_or_none, + urljoin, + variadic, ) +from ..utils.traversal import traverse_obj + + +def html_get_element(tag=None, cls=None): + assert tag or cls, 'One of tag or class is required' + + if cls: + func = functools.partial(get_elements_by_class, cls, tag=tag) + else: + func = functools.partial(get_element_text_and_html_by_tag, tag) + + def html_get_element_wrapper(html): + return variadic(func(html))[0] + + return html_get_element_wrapper class BpbIE(InfoExtractor): IE_DESC = 'Bundeszentrale für politische Bildung' - _VALID_URL = r'https?://(?:www\.)?bpb\.de/mediathek/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.|m\.)?bpb\.de/(?:[^/?#]+/)*(?P\d+)(?:[/?#]|$)' - _TEST = { + _TESTS = [{ 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', - 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', 'info_dict': { 'id': '297', 'ext': 'mp4', + 'creator': 'Kooperative Berlin', + 'description': 'md5:f4f75885ba009d3e2b156247a8941ce6', + 'release_date': '20160115', + 'series': 'Interview auf dem Geschichtsforum 1989 | 2009', + 'tags': ['Friedliche Revolution', 'Erinnerungskultur', 'Vergangenheitspolitik', 'DDR 1949 - 1990', 'Freiheitsrecht', 'BStU', 'Deutschland'], + 'thumbnail': 'https://www.bpb.de/cache/images/7/297_teaser_16x9_1240.jpg?8839D', 'title': 'Joachim Gauck zu 1989 und die Erinnerung an die DDR', - 'description': 'Joachim Gauck, erster Beauftragter für die Stasi-Unterlagen, spricht auf dem Geschichtsforum über die friedliche Revolution 1989 und eine "gewisse Traurigkeit" im Umgang mit der DDR-Vergangenheit.' 
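+            # the 'md5:...' values above are checksums of the full field text, which the test runner compares against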
+ 'uploader': 'Bundeszentrale für politische Bildung', + }, + }, { + 'url': 'https://www.bpb.de/mediathek/video/522184/krieg-flucht-und-falschmeldungen-wirstattdesinformation-2/', + 'info_dict': { + 'id': '522184', + 'ext': 'mp4', + 'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)', + 'description': 'md5:f83c795ff8f825a69456a9e51fc15903', + 'release_date': '20230621', + 'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'], + 'thumbnail': 'https://www.bpb.de/cache/images/4/522184_teaser_16x9_1240.png?EABFB', + 'title': 'md5:9b01ccdbf58dbf9e5c9f6e771a803b1c', + 'uploader': 'Bundeszentrale für politische Bildung', + }, + }, { + 'url': 'https://www.bpb.de/lernen/bewegtbild-und-politische-bildung/webvideo/518789/krieg-flucht-und-falschmeldungen-wirstattdesinformation-1/', + 'info_dict': { + 'id': '518789', + 'ext': 'mp4', + 'creator': 'Institute for Strategic Dialogue Germany gGmbH (ISD)', + 'description': 'md5:85228aed433e84ff0ff9bc582abd4ea8', + 'release_date': '20230302', + 'tags': ['Desinformation', 'Ukraine', 'Russland', 'Geflüchtete'], + 'thumbnail': 'https://www.bpb.de/cache/images/9/518789_teaser_16x9_1240.jpeg?56D0D', + 'title': 'md5:3e956f264bb501f6383f10495a401da4', + 'uploader': 'Bundeszentrale für politische Bildung', + }, + }, { + 'url': 'https://www.bpb.de/mediathek/podcasts/apuz-podcast/539727/apuz-20-china/', + 'only_matching': True, + }, { + 'url': 'https://www.bpb.de/mediathek/audio/315813/folge-1-eine-einfuehrung/', + 'info_dict': { + 'id': '315813', + 'ext': 'mp3', + 'creator': 'Axel Schröder', + 'description': 'md5:eda9d1af34e5912efef5baf54fba4427', + 'release_date': '20200921', + 'series': 'Auf Endlagersuche. Der deutsche Weg zu einem sicheren Atommülllager', + 'tags': ['Atomenergie', 'Endlager', 'hoch-radioaktiver Abfall', 'Endlagersuche', 'Atommüll', 'Atomendlager', 'Gorleben', 'Deutschland'], + 'thumbnail': 'https://www.bpb.de/cache/images/3/315813_teaser_16x9_1240.png?92A94', + 'title': 'Folge 1: Eine Einführung', + 'uploader': 'Bundeszentrale für politische Bildung', + }, + }, { + 'url': 'https://www.bpb.de/517806/die-weltanschauung-der-neuen-rechten/', + 'info_dict': { + 'id': '517806', + 'ext': 'mp3', + 'creator': 'Bundeszentrale für politische Bildung', + 'description': 'md5:594689600e919912aade0b2871cc3fed', + 'release_date': '20230127', + 'series': 'Vorträge des Fachtags "Modernisierer. Grenzgänger. Anstifter. 
Sechs Jahrzehnte \'Neue Rechte\'"', + 'tags': ['Rechtsextremismus', 'Konservatismus', 'Konservativismus', 'neue Rechte', 'Rechtspopulismus', 'Schnellroda', 'Deutschland'], + 'thumbnail': 'https://www.bpb.de/cache/images/6/517806_teaser_16x9_1240.png?7A7A0', + 'title': 'Die Weltanschauung der "Neuen Rechten"', + 'uploader': 'Bundeszentrale für politische Bildung', + }, + }, { + 'url': 'https://www.bpb.de/mediathek/reihen/zahlen-und-fakten-soziale-situation-filme/520153/zahlen-und-fakten-die-soziale-situation-in-deutschland-migration/', + 'only_matching': True, + }] + + _TITLE_RE = re.compile('(?P[^<]*)<[^>]+>(?P<series>[^<]*)') + + def _parse_vue_attributes(self, name, string, video_id): + attributes = extract_attributes(self._search_regex(rf'(<{name}(?:"[^"]*?"|[^>])*>)', string, name)) + + for key, value in attributes.items(): + if key.startswith(':'): + attributes[key] = self._parse_json(value, video_id, transform_source=js_to_json, fatal=False) + + return attributes + + @staticmethod + def _process_source(source): + url = url_or_none(source['src']) + if not url: + return None + + source_type = source.get('type', '') + extension = mimetype2ext(source_type) + is_video = source_type.startswith('video') + note = url.rpartition('.')[0].rpartition('_')[2] if is_video else None + + return { + 'url': url, + 'ext': extension, + 'vcodec': None if is_video else 'none', + 'quality': 10 if note == 'high' else 0, + 'format_note': note, + 'format_id': join_nonempty(extension, note), } - } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._html_search_regex( - r'<h2 class="white">(.*?)</h2>', webpage, 'title') - video_info_dicts = re.findall( - r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) - - formats = [] - for video_info in video_info_dicts: - video_info = self._parse_json( - video_info, video_id, transform_source=js_to_json, fatal=False) - if not video_info: - continue - video_url = video_info.get('src') - if not video_url: - continue - quality = 'high' if '_high' in video_url else 'low' - formats.append({ - 'url': video_url, - 'quality': 10 if quality == 'high' else 0, - 'format_note': quality, - 'format_id': '%s-%s' % (quality, determine_ext(video_url)), - }) + title_result = traverse_obj(webpage, ({html_get_element(cls='opening-header__title')}, {self._TITLE_RE.match})) + json_lds = list(self._yield_json_ld(webpage, video_id, fatal=False)) return { 'id': video_id, - 'formats': formats, - 'title': title, - 'description': self._og_search_description(webpage), + 'title': traverse_obj(title_result, ('title', {str.strip})) or None, + # This metadata could be interpreted otherwise, but it fits "series" the most + 'series': traverse_obj(title_result, ('series', {str.strip})) or None, + 'description': join_nonempty(*traverse_obj(webpage, [( + {html_get_element(cls='opening-intro')}, + [{html_get_element(tag='bpb-accordion-item')}, {html_get_element(cls='text-content')}], + ), {clean_html}]), delim='\n\n') or None, + 'creator': self._html_search_meta('author', webpage), + 'uploader': self._html_search_meta('publisher', webpage), + 'release_date': unified_strdate(self._html_search_meta('date', webpage)), + 'tags': traverse_obj(json_lds, (..., 'keywords', {lambda x: x.split(',')}, ...)), + **traverse_obj(self._parse_vue_attributes('bpb-player', webpage, video_id), { + 'formats': (':sources', ..., {self._process_source}), + 'thumbnail': ('poster', {lambda x: urljoin(url, x)}), + }), } diff --git a/yt_dlp/utils/_utils.py 
b/yt_dlp/utils/_utils.py
index f5552ce80..180bec245 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -2847,6 +2847,7 @@ def mimetype2ext(mt, default=NO_DEFAULT):
         'quicktime': 'mov',
         'webm': 'webm',
         'vp9': 'vp9',
+        'video/ogg': 'ogv',
         'x-flv': 'flv',
         'x-m4v': 'm4v',
         'x-matroska': 'mkv',

From 069cbece9dba6384f1cc5fcfc7ce562a31af42fc Mon Sep 17 00:00:00 2001
From: bashonly <bashonly@bashonly.com>
Date: Sat, 16 Sep 2023 13:28:14 -0500
Subject: [PATCH 137/218] [ie/tiktok] Fix webpage extraction

Closes #8089
Authored by: bashonly
---
 yt_dlp/extractor/tiktok.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py
index f14c4f9d6..f26972cff 100644
--- a/yt_dlp/extractor/tiktok.py
+++ b/yt_dlp/extractor/tiktok.py
@@ -15,7 +15,6 @@
     UserNotLive,
     determine_ext,
     format_field,
-    get_element_by_id,
     get_first,
     int_or_none,
     join_nonempty,
@@ -50,8 +49,9 @@ def _create_url(user_id, video_id):
         return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'

     def _get_sigi_state(self, webpage, display_id):
-        return self._parse_json(get_element_by_id(
-            'SIGI_STATE|sigi-persisted-data', webpage, escape_value=False), display_id)
+        return self._search_json(
+            r'<script[^>]+\bid="(?:SIGI_STATE|sigi-persisted-data)"[^>]*>', webpage,
+            'sigi state', display_id, end_pattern=r'</script>')

     def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True,
                        note='Downloading API JSON', errnote='Unable to download API page'):

From cebbd33b1c678149fc8f0e254db6fc0da317ea80 Mon Sep 17 00:00:00 2001
From: c-basalt <117849907+c-basalt@users.noreply.github.com>
Date: Sat, 16 Sep 2023 16:43:12 -0400
Subject: [PATCH 138/218] [ie/twitcasting] Improve `_VALID_URL` (#8120)

Closes #7597
Authored by: c-basalt
---
 yt_dlp/extractor/twitcasting.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py
index dff353a4f..3890d5d8f 100644
--- a/yt_dlp/extractor/twitcasting.py
+++ b/yt_dlp/extractor/twitcasting.py
@@ -22,7 +22,7 @@


 class TwitCastingIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/(?:movie|twplayer)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<uploader_id>[^/?#]+)/(?:movie|twplayer)/(?P<id>\d+)'
     _M3U8_HEADERS = {
         'Origin': 'https://twitcasting.tv',
         'Referer': 'https://twitcasting.tv/',
@@ -231,7 +231,7 @@ def find_dmu(x):


 class TwitCastingLiveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/?(?:[#?]|$)'
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<id>[^/?#]+)/?(?:[#?]|$)'
     _TESTS = [{
         'url': 'https://twitcasting.tv/ivetesangalo',
         'only_matching': True,
@@ -265,8 +265,15 @@


 class TwitCastingUserIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<id>[^/]+)/show/?(?:[#?]|$)'
+    _VALID_URL = r'https?://(?:[^/?#]+\.)?twitcasting\.tv/(?P<id>[^/?#]+)/(?:show|archive)/?(?:[#?]|$)'
     _TESTS = [{
+        'url': 'https://twitcasting.tv/natsuiromatsuri/archive/',
+        'info_dict': {
+            'id': 'natsuiromatsuri',
+            'title': 'natsuiromatsuri - Live History',
+        },
+        'playlist_mincount': 235,
+    }, {
         'url': 'https://twitcasting.tv/noriyukicas/show',
         'only_matching': True,
     }]

From 9bf14be775289bd88cc1f5c89fd761ae51879484 Mon Sep 17 00:00:00 2001
From: makeworld <25111343+makew0rld@users.noreply.github.com>
Date: Sat, 16 Sep 2023 16:49:43 -0400
Subject: [PATCH 139/218] [ie/cbc]
Ignore any 426 from API (#7689) Closes #7477 Authored by: makew0rld --- yt_dlp/extractor/cbc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index b3c5471f7..2920b9027 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -339,12 +339,12 @@ def _new_claims_token(self, email, password): data = json.dumps({'jwt': sig}).encode() headers = {'content-type': 'application/json', 'ott-device-type': 'web'} resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token', - None, data=data, headers=headers) + None, data=data, headers=headers, expected_status=426) cbc_access_token = resp['accessToken'] headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token} resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile', - None, headers=headers) + None, headers=headers, expected_status=426) return resp['claimsToken'] def _get_claims_token_expiry(self): From 5336bf57a7061e0955a37f0542fc8ebf50d55b17 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 16 Sep 2023 16:53:57 -0400 Subject: [PATCH 140/218] [ie/bilibili] Extract `format_id` (#7555) Authored by: c-basalt --- yt_dlp/extractor/bilibili.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index cb7ab2a17..290340078 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -3,6 +3,7 @@ import hashlib import itertools import math +import re import time import urllib.parse @@ -38,6 +39,8 @@ class BilibiliBaseIE(InfoExtractor): + _FORMAT_ID_RE = re.compile(r'-(\d+)\.m4s\?') + def extract_formats(self, play_info): format_names = { r['quality']: traverse_obj(r, 'new_description', 'display_desc') @@ -54,7 +57,8 @@ def extract_formats(self, play_info): 'acodec': audio.get('codecs'), 'vcodec': 'none', 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), - 'filesize': int_or_none(audio.get('size')) + 'filesize': int_or_none(audio.get('size')), + 'format_id': str_or_none(audio.get('id')), } for audio in audios] formats.extend({ @@ -68,6 +72,9 @@ def extract_formats(self, play_info): 'tbr': float_or_none(video.get('bandwidth'), scale=1000), 'filesize': int_or_none(video.get('size')), 'quality': int_or_none(video.get('id')), + 'format_id': traverse_obj( + video, (('baseUrl', 'base_url'), {self._FORMAT_ID_RE.search}, 1), + ('id', {str_or_none}), get_all=False), 'format': format_names.get(video.get('id')), } for video in traverse_obj(play_info, ('dash', 'video', ...))) From 9d376c4daeaf1279a011582f3f0e6ae42af520dd Mon Sep 17 00:00:00 2001 From: Aniruddh Joshi <aniruddh@ebincoweb.com> Date: Sun, 17 Sep 2023 02:28:21 +0530 Subject: [PATCH 141/218] [ie/AmazonMiniTV] Fix extractor (#8103) Closes #7817 Authored by: Aniruddh-J --- yt_dlp/extractor/amazonminitv.py | 63 +++++--------------------------- 1 file changed, 9 insertions(+), 54 deletions(-) diff --git a/yt_dlp/extractor/amazonminitv.py b/yt_dlp/extractor/amazonminitv.py index b57d985d1..ad23b16bd 100644 --- a/yt_dlp/extractor/amazonminitv.py +++ b/yt_dlp/extractor/amazonminitv.py @@ -37,7 +37,7 @@ def _call_api(self, asin, data=None, note=None): return resp['data'][data['operationName']] -class AmazonMiniTVIE(AmazonMiniTVBaseIE): +class AmazonMiniTVIE(InfoExtractor): _VALID_URL = 
r'(?:https?://(?:www\.)?amazon\.in/minitv/tp/|amazonminitv:(?:amzn1\.dv\.gti\.)?)(?P<id>[a-f0-9-]+)' _TESTS = [{ 'url': 'https://www.amazon.in/minitv/tp/75fe3a75-b8fe-4499-8100-5c9424344840?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', @@ -86,56 +86,14 @@ class AmazonMiniTVIE(AmazonMiniTVBaseIE): 'only_matching': True, }] - _GRAPHQL_QUERY_CONTENT = ''' -query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!, $contentType: ContentType!, $clientId: String) { - content( - applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} - contentId: $contentId - contentType: $contentType - ) { - contentId - name - ... on Episode { - contentId - vodType - name - images - description { - synopsis - contentLengthInSeconds - } - publicReleaseDateUTC - audioTracks - seasonId - seriesId - seriesName - seasonNumber - episodeNumber - timecode { - endCreditsTime - } - } - ... on MovieContent { - contentId - vodType - name - description { - synopsis - contentLengthInSeconds - } - images - publicReleaseDateUTC - audioTracks - } - } -}''' - def _real_extract(self, url): - asin = f'amzn1.dv.gti.{self._match_id(url)}' - prs = self._call_api(asin, note='Downloading playback info') + video_uuid = self._match_id(url) + asin = f'amzn1.dv.gti.{video_uuid}' + webpage = self._download_webpage(f'https://www.amazon.in/minitv/tp/{video_uuid}', asin) + data = self._search_nextjs_data(webpage, asin)['props']['pageProps']['ssrProps'] formats, subtitles = [], {} - for type_, asset in prs['playbackAssets'].items(): + for type_, asset in traverse_obj(data, ('playbackData', 'playbackAssets', {dict.items}, ...)): if not traverse_obj(asset, 'manifestUrl'): continue if type_ == 'hls': @@ -152,12 +110,7 @@ def _real_extract(self, url): else: self.report_warning(f'Unknown asset type: {type_}') - title_info = self._call_api( - asin, note='Downloading title info', data={ - 'operationName': 'content', - 'variables': {'contentId': asin}, - 'query': self._GRAPHQL_QUERY_CONTENT, - }) + title_info = traverse_obj(data, ('contentData', {dict})) or {} credits_time = try_get(title_info, lambda x: x['timecode']['endCreditsTime'] / 1000) is_episode = title_info.get('vodType') == 'EPISODE' @@ -192,6 +145,7 @@ class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:season' _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)' IE_DESC = 'Amazon MiniTV Season, "minitv:season:" prefix' + _WORKING = False _TESTS = [{ 'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', 'playlist_mincount': 6, @@ -251,6 +205,7 @@ class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:series' _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)' IE_DESC = 'Amazon MiniTV Series, "minitv:series:" prefix' + _WORKING = False _TESTS = [{ 'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', 'playlist_mincount': 3, From a83da3717d30697102e76f63a6f29d77f9373c2a Mon Sep 17 00:00:00 2001 From: ApoorvShah111 <79164543+ApoorvShah111@users.noreply.github.com> Date: Sun, 17 Sep 2023 02:31:26 +0530 Subject: [PATCH 142/218] [ie/nitter] Fix title extraction fallback (#8102) Closes #7575 Authored by: ApoorvShah111 --- yt_dlp/extractor/nitter.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/nitter.py b/yt_dlp/extractor/nitter.py index 5d1ca1f5d..35d1311dc 100644 --- a/yt_dlp/extractor/nitter.py +++ b/yt_dlp/extractor/nitter.py 
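The one-line fix in the hunk below hinges on `default=None`: without it,
`_og_search_description()` raises when a page has no og:description tag, so
the `or`-fallback to the tweet-content regex is never reached. A minimal
sketch of the fixed control flow, with `fallback` standing in for the regex
lookup:

    title = self._og_search_description(full_webpage, default=None) or fallback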
@@ -265,6 +265,26 @@ class NitterIE(InfoExtractor): 'repost_count': int, 'comment_count': int, } + }, { # no OpenGraph title + 'url': f'https://{current_instance}/LocalBateman/status/1678455464038735895#m', + 'info_dict': { + 'id': '1678455464038735895', + 'ext': 'mp4', + 'title': 'Your Typical Local Man - Local man, what did Romanians ever do to you?', + 'description': 'Local man, what did Romanians ever do to you?', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Your Typical Local Man', + 'uploader_id': 'LocalBateman', + 'uploader_url': f'https://{current_instance}/LocalBateman', + 'upload_date': '20230710', + 'timestamp': 1689009900, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'params': {'skip_download': 'm3u8'}, } ] @@ -292,7 +312,7 @@ def _real_extract(self, url): 'ext': ext }] - title = description = self._og_search_description(full_webpage) or self._html_search_regex( + title = description = self._og_search_description(full_webpage, default=None) or self._html_search_regex( r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False) uploader_id = self._html_search_regex( From ecef42c3adbcb6a84405139047923c4967316f28 Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Sun, 17 Sep 2023 05:04:10 +0800 Subject: [PATCH 143/218] [ie/zaiko] Improve thumbnail extraction (#8054) Authored by: pzhlkj6612 --- yt_dlp/extractor/zaiko.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/zaiko.py b/yt_dlp/extractor/zaiko.py index 0ccacbb6a..2b6221da2 100644 --- a/yt_dlp/extractor/zaiko.py +++ b/yt_dlp/extractor/zaiko.py @@ -9,6 +9,7 @@ traverse_obj, try_call, unescapeHTML, + url_basename, url_or_none, ) @@ -45,12 +46,14 @@ class ZaikoIE(ZaikoBaseIE): 'uploader_id': '454', 'uploader': 'ZAIKO ZERO', 'release_timestamp': 1583809200, - 'thumbnail': r're:https://[a-z0-9]+.cloudfront.net/[a-z0-9_]+/[a-z0-9_]+', + 'thumbnail': r're:^https://[\w.-]+/\w+/\w+', + 'thumbnails': 'maxcount:2', 'release_date': '20200310', 'categories': ['Tech House'], 'live_status': 'was_live', }, 'params': {'skip_download': 'm3u8'}, + 'skip': 'Your account does not have tickets to this event', }] def _real_extract(self, url): @@ -83,6 +86,12 @@ def _real_extract(self, url): if not formats: self.raise_no_formats(msg, expected=expected) + thumbnail_urls = [ + traverse_obj(player_meta, ('initial_event_info', 'poster_url')), + self._og_search_thumbnail(self._download_webpage( + f'https://zaiko.io/event/{video_id}', video_id, 'Downloading event page', fatal=False) or ''), + ] + return { 'id': video_id, 'formats': formats, @@ -96,8 +105,8 @@ def _real_extract(self, url): }), **traverse_obj(player_meta, ('initial_event_info', { 'alt_title': ('title', {str}), - 'thumbnail': ('poster_url', {url_or_none}), })), + 'thumbnails': [{'url': url, 'id': url_basename(url)} for url in thumbnail_urls if url_or_none(url)] } From 0ce1f48bf1cb78d40d734ce73ee1c90eccf92274 Mon Sep 17 00:00:00 2001 From: 04-pasha-04 <89145825+04-pasha-04@users.noreply.github.com> Date: Sat, 16 Sep 2023 23:06:00 +0200 Subject: [PATCH 144/218] [ie/funker530] Fix extraction (#8040) Authored by: 04-pasha-04 --- yt_dlp/extractor/funker530.py | 1 + yt_dlp/extractor/rumble.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/funker530.py b/yt_dlp/extractor/funker530.py index ba5ab7d4e..62fd7f6dd 100644 
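The `info = {}` initialisation added below guards against an unbound local:
`info` was previously only assigned inside the `if rumble_url:` branch, so
pages without a Rumble embed could crash with UnboundLocalError when `info`
was read later. A sketch of the old flow:

    rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage))
    if rumble_url:
        info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()}
    # ... subsequent code reads `info` unconditionally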
--- a/yt_dlp/extractor/funker530.py +++ b/yt_dlp/extractor/funker530.py @@ -60,6 +60,7 @@ class Funker530IE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + info = {} rumble_url = list(RumbleEmbedIE._extract_embed_urls(url, webpage)) if rumble_url: info = {'url': rumble_url[0], 'ie_key': RumbleEmbedIE.ie_key()} diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index f8bf4a182..96c192581 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -144,7 +144,7 @@ def _extract_embed_urls(cls, url, webpage): if embeds: return embeds return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( - r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{\s*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] + r'<script>[^<]*\bRumble\(\s*"play"\s*,\s*{[^}]*[\'"]?video[\'"]?\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] def _real_extract(self, url): video_id = self._match_id(url) From 23d829a3420450bcfb0788e6fb2cf4f6acdbe596 Mon Sep 17 00:00:00 2001 From: Tristan Lee <lee.tristan.evans@gmail.com> Date: Sat, 16 Sep 2023 16:08:15 -0500 Subject: [PATCH 145/218] [ie/Rumble] Fix embed extraction (#8035) Authored by: trislee --- yt_dlp/extractor/rumble.py | 59 ++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 25 deletions(-) diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py index 96c192581..85567d9a2 100644 --- a/yt_dlp/extractor/rumble.py +++ b/yt_dlp/extractor/rumble.py @@ -33,7 +33,7 @@ class RumbleEmbedIE(InfoExtractor): 'upload_date': '20191020', 'channel_url': 'https://rumble.com/c/WMAR', 'channel': 'WMAR', - 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.qR4e-small-WMAR-2-News-Latest-Headline.jpg', 'duration': 234, 'uploader': 'WMAR', 'live_status': 'not_live', @@ -84,7 +84,7 @@ class RumbleEmbedIE(InfoExtractor): 'info_dict': { 'id': 'v1essrt', 'ext': 'mp4', - 'title': 'startswith:lofi hip hop radio - beats to relax/study', + 'title': 'startswith:lofi hip hop radio 📚 - beats to relax/study to', 'timestamp': 1661519399, 'upload_date': '20220826', 'channel_url': 'https://rumble.com/c/LofiGirl', @@ -99,7 +99,7 @@ class RumbleEmbedIE(InfoExtractor): 'url': 'https://rumble.com/embed/v1amumr', 'info_dict': { 'id': 'v1amumr', - 'ext': 'webm', + 'ext': 'mp4', 'fps': 60, 'title': 'Turning Point USA 2022 Student Action Summit DAY 1 - Rumble Exclusive Live', 'timestamp': 1658518457, @@ -129,7 +129,7 @@ class RumbleEmbedIE(InfoExtractor): 'duration': 92, 'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh', 'channel_url': 'https://rumble.com/c/RichSementa', - 'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.qR4e-small-911-Audio-From-The-Man-Who-.jpg', 'timestamp': 1654892716, 'uploader': 'Mr Producer Media', 'upload_date': '20220610', @@ -236,7 +236,9 @@ def _real_extract(self, url): class RumbleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rumble\.com/(?P<id>v(?!ideos)[\w.-]+)[^/]*$' - _EMBED_REGEX = [r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>'] + _EMBED_REGEX = [ + r'<a class=video-item--a href=(?P<url>/v[\w.-]+\.html)>', + r'<a[^>]+class="videostream__link link"[^>]+href=(?P<url>/v[\w.-]+\.html)[^>]*>'] _TESTS = [{ 'add_ie': ['RumbleEmbed'], 'url': 
'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html', @@ -254,6 +256,7 @@ class RumbleIE(InfoExtractor): 'thumbnail': r're:https://.+\.jpg', 'duration': 103, 'like_count': int, + 'dislike_count': int, 'view_count': int, 'live_status': 'not_live', } @@ -278,6 +281,9 @@ class RumbleIE(InfoExtractor): 'channel_url': 'https://rumble.com/c/Redacted', 'live_status': 'not_live', 'thumbnail': 'https://sp.rmbl.ws/s8/1/d/x/2/O/dx2Oi.qR4e-small-The-U.S.-CANNOT-hide-this-i.jpg', + 'like_count': int, + 'dislike_count': int, + 'view_count': int, }, }, { 'url': 'https://rumble.com/v2e7fju-the-covid-twitter-files-drop-protecting-fauci-while-censoring-the-truth-wma.html', @@ -296,12 +302,15 @@ class RumbleIE(InfoExtractor): 'channel_url': 'https://rumble.com/c/KimIversen', 'channel': 'Kim Iversen', 'thumbnail': 'https://sp.rmbl.ws/s8/1/6/b/w/O/6bwOi.qR4e-small-The-Covid-Twitter-Files-Dro.jpg', + 'like_count': int, + 'dislike_count': int, + 'view_count': int, }, }] _WEBPAGE_TESTS = [{ 'url': 'https://rumble.com/videos?page=2', - 'playlist_count': 25, + 'playlist_mincount': 24, 'info_dict': { 'id': 'videos?page=2', 'title': 'All videos', @@ -309,17 +318,16 @@ class RumbleIE(InfoExtractor): 'age_limit': 0, }, }, { - 'url': 'https://rumble.com/live-videos', - 'playlist_mincount': 19, + 'url': 'https://rumble.com/browse/live', + 'playlist_mincount': 25, 'info_dict': { - 'id': 'live-videos', - 'title': 'Live Videos', - 'description': 'Live videos on Rumble.com', + 'id': 'live', + 'title': 'Browse', 'age_limit': 0, }, }, { 'url': 'https://rumble.com/search/video?q=rumble&sort=views', - 'playlist_count': 24, + 'playlist_mincount': 24, 'info_dict': { 'id': 'video?q=rumble&sort=views', 'title': 'Search results for: rumble', @@ -334,19 +342,20 @@ def _real_extract(self, url): if not url_info: raise UnsupportedError(url) - release_ts_str = self._search_regex( - r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)', - webpage, 'release date', fatal=False, default=None) - view_count_str = self._search_regex(r'<span class="media-heading-info">([\d,]+) Views', - webpage, 'view count', fatal=False, default=None) - - return self.url_result( - url_info['url'], ie_key=url_info['ie_key'], url_transparent=True, - view_count=parse_count(view_count_str), - release_timestamp=parse_iso8601(release_ts_str), - like_count=parse_count(get_element_by_class('rumbles-count', webpage)), - description=clean_html(get_element_by_class('media-description', webpage)), - ) + return { + '_type': 'url_transparent', + 'ie_key': url_info['ie_key'], + 'url': url_info['url'], + 'release_timestamp': parse_iso8601(self._search_regex( + r'(?:Livestream begins|Streamed on):\s+<time datetime="([^"]+)', webpage, 'release date', default=None)), + 'view_count': int_or_none(self._search_regex( + r'"userInteractionCount"\s*:\s*(\d+)', webpage, 'view count', default=None)), + 'like_count': parse_count(self._search_regex( + r'<span data-js="rumbles_up_votes">\s*([\d,.KM]+)', webpage, 'like count', default=None)), + 'dislike_count': parse_count(self._search_regex( + r'<span data-js="rumbles_down_votes">\s*([\d,.KM]+)', webpage, 'dislike count', default=None)), + 'description': clean_html(get_element_by_class('media-description', webpage)) + } class RumbleChannelIE(InfoExtractor): From b4c1c408c63724339eb12b16c91b253a7ee62cfa Mon Sep 17 00:00:00 2001 From: barsnick <barsnick@users.noreply.github.com> Date: Sat, 16 Sep 2023 23:11:05 +0200 Subject: [PATCH 146/218] [ie/Bild.de] Extract HLS formats (#8032) Closes #7951 Authored by: 
barsnick --- yt_dlp/extractor/bild.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/bild.py b/yt_dlp/extractor/bild.py index f3dea33c4..eb289329d 100644 --- a/yt_dlp/extractor/bild.py +++ b/yt_dlp/extractor/bild.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from ..utils import ( int_or_none, + traverse_obj, unescapeHTML, ) @@ -8,7 +9,8 @@ class BildIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bild\.de/(?:[^/]+/)+(?P<display_id>[^/]+)-(?P<id>\d+)(?:,auto=true)?\.bild\.html' IE_DESC = 'Bild.de' - _TEST = { + _TESTS = [{ + 'note': 'static MP4 only', 'url': 'http://www.bild.de/video/clip/apple-ipad-air/das-koennen-die-neuen-ipads-38184146.bild.html', 'md5': 'dd495cbd99f2413502a1713a1156ac8a', 'info_dict': { @@ -19,7 +21,19 @@ class BildIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 196, } - } + }, { + 'note': 'static MP4 and HLS', + 'url': 'https://www.bild.de/video/clip/news-ausland/deftiger-abgang-vom-10m-turm-bademeister-sorgt-fuer-skandal-85158620.bild.html', + 'md5': 'fb0ed4f09c495d4ba7ce2eee0bb90de1', + 'info_dict': { + 'id': '85158620', + 'ext': 'mp4', + 'title': 'Der Sprungturm-Skandal', + 'description': 'md5:709b543c24dc31bbbffee73bccda34ad', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 69, + } + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -27,11 +41,23 @@ def _real_extract(self, url): video_data = self._download_json( url.split('.bild.html')[0] + ',view=json.bild.html', video_id) + formats = [] + for src in traverse_obj(video_data, ('clipList', 0, 'srces', lambda _, v: v['src'])): + src_type = src.get('type') + if src_type == 'application/x-mpegURL': + formats.extend( + self._extract_m3u8_formats( + src['src'], video_id, 'mp4', m3u8_id='hls', fatal=False)) + elif src_type == 'video/mp4': + formats.append({'url': src['src'], 'format_id': 'http-mp4'}) + else: + self.report_warning(f'Skipping unsupported format type: "{src_type}"') + return { 'id': video_id, 'title': unescapeHTML(video_data['title']).strip(), 'description': unescapeHTML(video_data.get('description')), - 'url': video_data['clipList'][0]['srces'][0]['src'], + 'formats': formats, 'thumbnail': video_data.get('poster'), 'duration': int_or_none(video_data.get('durationSec')), } From 5be7e978867b5f66ad6786c674d79d40e950ae16 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Sat, 16 Sep 2023 17:13:04 -0400 Subject: [PATCH 147/218] [ie/sohu] Fix extractor (#7628) Closes #1667, Closes #7463 Authored by: c-basalt, bashonly --- yt_dlp/extractor/_extractors.py | 5 +- yt_dlp/extractor/sohu.py | 107 ++++++++++++++++++++++++++++++-- 2 files changed, 105 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b836fe8a3..4fed6d66a 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1795,7 +1795,10 @@ from .slutload import SlutloadIE from .smotrim import SmotrimIE from .snotr import SnotrIE -from .sohu import SohuIE +from .sohu import ( + SohuIE, + SohuVIE, +) from .sonyliv import ( SonyLIVIE, SonyLIVSeriesIE, diff --git a/yt_dlp/extractor/sohu.py b/yt_dlp/extractor/sohu.py index a8f1e4623..c0ff4f9aa 100644 --- a/yt_dlp/extractor/sohu.py +++ b/yt_dlp/extractor/sohu.py @@ -1,3 +1,4 @@ +import base64 import re from .common import InfoExtractor @@ -8,7 +9,12 @@ from ..utils import ( ExtractorError, int_or_none, + float_or_none, + url_or_none, + unified_timestamp, 
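+    # new helpers backing the extra metadata fields extracted further below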
try_get, + urljoin, + traverse_obj, ) @@ -31,13 +37,20 @@ class SohuIE(InfoExtractor): 'id': '409385080', 'ext': 'mp4', 'title': '《2015湖南卫视羊年元宵晚会》唐嫣《花好月圆》', - } + }, + 'skip': 'no longer available', }, { 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', 'info_dict': { 'id': '78693464', 'ext': 'mp4', 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + 'uploader': '爱范儿视频', + 'duration': 213, + 'timestamp': 1425519600, + 'upload_date': '20150305', + 'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg', + 'tags': ['爱范儿', '爱范品', 'MWC', '手机'], } }, { 'note': 'Multipart video', @@ -45,6 +58,12 @@ class SohuIE(InfoExtractor): 'info_dict': { 'id': '78910339', 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + 'uploader': '小苍cany', + 'duration': 744.0, + 'timestamp': 1426269360, + 'upload_date': '20150313', + 'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg', + 'tags': ['小苍MM', '英雄联盟', '实战秘籍'], }, 'playlist': [{ 'info_dict': { @@ -75,6 +94,11 @@ class SohuIE(InfoExtractor): 'id': '78932792', 'ext': 'mp4', 'title': 'youtube-dl testing video', + 'duration': 360, + 'timestamp': 1426348620, + 'upload_date': '20150314', + 'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M02/8A/00/MTAuMTAuODguNzk=/6_14cee1be192g102SysCutcloud_78932792_7_7b.jpg', + 'tags': [], }, 'params': { 'skip_download': True @@ -100,7 +124,7 @@ def _fetch_data(vid_id, mytv=False): webpage = self._download_webpage(url, video_id) - title = re.sub(r' - 搜狐视频$', '', self._og_search_title(webpage)) + title = re.sub(r'( - 高清正版在线观看)? - 搜狐视频$', '', self._og_search_title(webpage)) vid = self._html_search_regex( r'var vid ?= ?["\'](\d+)["\']', @@ -132,7 +156,9 @@ def _fetch_data(vid_id, mytv=False): allot = format_data['allot'] data = format_data['data'] - clips_url = data['clipsURL'] + clip_url = traverse_obj(data, (('clipsURL', 'mp4PlayUrl'), i, {url_or_none}), get_all=False) + if not clip_url: + raise ExtractorError(f'Unable to extract url for clip {i}') su = data['su'] video_url = 'newflv.sohu.ccgslb.net' @@ -142,9 +168,9 @@ def _fetch_data(vid_id, mytv=False): while 'newflv.sohu.ccgslb.net' in video_url: params = { 'prot': 9, - 'file': clips_url[i], + 'file': clip_url, 'new': su[i], - 'prod': 'flash', + 'prod': 'h5n', 'rb': 1, } @@ -193,6 +219,75 @@ def _fetch_data(vid_id, mytv=False): 'entries': playlist, 'id': video_id, 'title': title, + 'duration': traverse_obj(vid_data, ('data', 'totalDuration', {float_or_none})), } - return info + if mytv: + publish_time = unified_timestamp(self._search_regex( + r'publishTime:\s*["\'](\d+-\d+-\d+ \d+:\d+)["\']', webpage, 'publish time', fatal=False)) + else: + publish_time = traverse_obj(vid_data, ('tv_application_time', {unified_timestamp})) + + return { + 'timestamp': publish_time - 8 * 3600 if publish_time else None, + **traverse_obj(vid_data, { + 'alt_title': ('data', 'subName', {str}), + 'uploader': ('wm_data', 'wm_username', {str}), + 'thumbnail': ('data', 'coverImg', {url_or_none}), + 'tags': ('data', 'tag', {str.split}), + }), + **info, + } + + +class SohuVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.sohu\.com/v/(?P<id>[\w=-]+)\.html(?:$|[#?])' + + _TESTS = [{ + 'note': 'Multipart video', + 'url': 'https://tv.sohu.com/v/MjAyMzA2MTQvbjYwMTMxNTE5Mi5zaHRtbA==.html', + 'info_dict': { + 'id': '601315192', + 'title': '《淬火丹心》第1集', + 'alt_title': '“点天灯”发生事故', + 'duration': 2701.692, + 'timestamp': 1686758040, + 'upload_date': '20230614', + 
'thumbnail': 'http://photocdn.tv.sohu.com/img/20230614/vrsa_hor_1686738763256_454010551.jpg', + }, + 'playlist_mincount': 9, + 'skip': 'Only available in China', + }, { + 'url': 'https://tv.sohu.com/v/dXMvMjMyNzk5ODg5Lzc4NjkzNDY0LnNodG1s.html', + 'info_dict': { + 'id': '78693464', + 'ext': 'mp4', + 'title': '【爱范品】第31期:MWC见不到的奇葩手机', + 'uploader': '爱范儿视频', + 'duration': 213, + 'timestamp': 1425519600, + 'upload_date': '20150305', + 'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M10/83/FA/MTAuMTAuODguODA=/6_14cbccdde5eg104SysCutcloud_78693464_7_0b.jpg', + 'tags': ['爱范儿', '爱范品', 'MWC', '手机'], + } + }, { + 'note': 'Multipart video', + 'url': 'https://tv.sohu.com/v/dXMvMjQyNTYyMTYzLzc4OTEwMzM5LnNodG1s.html?src=pl', + 'info_dict': { + 'id': '78910339', + 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', + 'uploader': '小苍cany', + 'duration': 744.0, + 'timestamp': 1426269360, + 'upload_date': '20150313', + 'thumbnail': 'http://e3f49eaa46b57.cdn.sohucs.com//group1/M11/89/57/MTAuMTAuODguODA=/6_14cea022a1dg102SysCutcloud_78910339_8_0b.jpg', + 'tags': ['小苍MM', '英雄联盟', '实战秘籍'], + }, + 'playlist_mincount': 3, + }] + + def _real_extract(self, url): + encoded_id = self._match_id(url) + path = base64.urlsafe_b64decode(encoded_id).decode() + subdomain = 'tv' if re.match(r'\d+/n\d+\.shtml', path) else 'my.tv' + return self.url_result(urljoin(f'http://{subdomain}.sohu.com/', path), SohuIE) From 308936619c8a4f3a52d73c829c2006ff6c55fea2 Mon Sep 17 00:00:00 2001 From: fireattack <human.peng@gmail.com> Date: Sun, 17 Sep 2023 05:18:04 +0800 Subject: [PATCH 148/218] [ie/facebook] Improve format sorting (#8074) Authored by: fireattack --- yt_dlp/extractor/facebook.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index c30a6b06a..50a750d3b 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -505,7 +505,6 @@ def process_formats(info): # with non-browser User-Agent. 
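         # facebookexternalhit/1.1 is Facebook's own crawler UA; it appears
         # to be served without the throttling applied to browser UAs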
for f in info['formats']: f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - info['_format_sort_fields'] = ('res', 'quality') def extract_relay_data(_filter): return self._parse_json(self._search_regex( @@ -552,7 +551,8 @@ def parse_graphql_video(video): else: formats.append({ 'format_id': format_id, - 'quality': q(format_id), + # sd, hd formats w/o resolution info should be deprioritized below DASH + 'quality': q(format_id) - 3, 'url': playable_url, }) extract_dash_manifest(video, formats) @@ -719,9 +719,11 @@ def parse_attachment(attachment, key='media'): for src_type in ('src', 'src_no_ratelimit'): src = f[0].get('%s_%s' % (quality, src_type)) if src: - preference = -10 if format_id == 'progressive' else -1 + # sd, hd formats w/o resolution info should be deprioritized below DASH + # TODO: investigate if progressive or src formats still exist + preference = -10 if format_id == 'progressive' else -3 if quality == 'hd': - preference += 5 + preference += 1 formats.append({ 'format_id': '%s_%s_%s' % (format_id, quality, src_type), 'url': src, From 53675852195d8dd859555d4789944a6887171ff8 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 16 Sep 2023 16:20:34 -0500 Subject: [PATCH 149/218] [ie/generic] Fix KVS thumbnail extraction Closes #8045 Authored by: bashonly --- yt_dlp/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index f5c59a093..33e71d1c5 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2370,7 +2370,7 @@ def _extract_kvs(self, url, webpage, video_id): 'id': flashvars['video_id'], 'display_id': display_id, 'title': title, - 'thumbnail': thumbnail, + 'thumbnail': urljoin(url, thumbnail), 'formats': formats, } From 635ae31f68a3ac7f6393d59657ed711e34ee3552 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 16 Sep 2023 16:22:21 -0500 Subject: [PATCH 150/218] [ie/mediastream] Make embed extraction non-fatal Authored by: bashonly --- yt_dlp/extractor/mediastream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/mediastream.py b/yt_dlp/extractor/mediastream.py index cef769f29..d5c9aab8a 100644 --- a/yt_dlp/extractor/mediastream.py +++ b/yt_dlp/extractor/mediastream.py @@ -14,7 +14,7 @@ class MediaStreamBaseIE(InfoExtractor): _BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)' def _extract_mediastream_urls(self, webpage): - yield from traverse_obj(list(self._yield_json_ld(webpage, None)), ( + yield from traverse_obj(list(self._yield_json_ld(webpage, None, fatal=False)), ( lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'), {lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None})) From 20c3c9b433dd47faf0dbde6b46e4e34eb76109a5 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 16 Sep 2023 16:23:54 -0500 Subject: [PATCH 151/218] [ie/reddit] Extract subtitles Closes #7814 Authored by: bashonly --- yt_dlp/extractor/reddit.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index 813e62874..62f669f35 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -319,16 +319,20 @@ def add_thumbnail(src): 'format_id': 'fallback', 'format_note': 'DASH video, mp4_dash', }] - formats.extend(self._extract_m3u8_formats( - hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False)) - 
formats.extend(self._extract_mpd_formats( - dash_playlist_url, display_id, mpd_id='dash', fatal=False)) + hls_fmts, subtitles = self._extract_m3u8_formats_and_subtitles( + hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(hls_fmts) + dash_fmts, dash_subs = self._extract_mpd_formats_and_subtitles( + dash_playlist_url, display_id, mpd_id='dash', fatal=False) + formats.extend(dash_fmts) + self._merge_subtitles(dash_subs, target=subtitles) return { **info, 'id': video_id, 'display_id': display_id, 'formats': formats, + 'subtitles': subtitles, 'duration': int_or_none(reddit_video.get('duration')), } From eda0e415d26eb084e570cf5372d38ee1f616b70f Mon Sep 17 00:00:00 2001 From: garret <garret1317@yandex.com> Date: Sat, 16 Sep 2023 23:47:49 +0100 Subject: [PATCH 152/218] [ie/bbc] Extract tracklist as chapters (#7788) Authored by: garret1317 --- yt_dlp/extractor/bbc.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index a55cdef2b..d1d6e04fa 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -15,11 +15,13 @@ float_or_none, get_element_by_class, int_or_none, + join_nonempty, js_to_json, parse_duration, parse_iso8601, parse_qs, strip_or_none, + traverse_obj, try_get, unescapeHTML, unified_timestamp, @@ -41,7 +43,6 @@ class BBCCoUkIE(InfoExtractor): iplayer(?:/[^/]+)?/(?:episode/|playlist/)| music/(?:clips|audiovideo/popular)[/#]| radio/player/| - sounds/play/| events/[^/]+/play/[^/]+/ ) (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) @@ -218,20 +219,6 @@ class BBCCoUkIE(InfoExtractor): # rtmp download 'skip_download': True, }, - }, { - 'url': 'https://www.bbc.co.uk/sounds/play/m0007jzb', - 'note': 'Audio', - 'info_dict': { - 'id': 'm0007jz9', - 'ext': 'mp4', - 'title': 'BBC Proms, 2019, Prom 34: West–Eastern Divan Orchestra', - 'description': "Live BBC Proms. 
West–Eastern Divan Orchestra with Daniel Barenboim and Martha Argerich.", - 'duration': 9840, - }, - 'params': { - # rtmp download - 'skip_download': True, - } }, { 'url': 'http://www.bbc.co.uk/iplayer/playlist/p01dvks4', 'only_matching': True, @@ -844,6 +831,20 @@ class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE 'upload_date': '20190604', 'categories': ['Psychology'], }, + }, { + # BBC Sounds + 'url': 'https://www.bbc.co.uk/sounds/play/m001q78b', + 'info_dict': { + 'id': 'm001q789', + 'ext': 'mp4', + 'title': 'The Night Tracks Mix - Music for the darkling hour', + 'thumbnail': 'https://ichef.bbci.co.uk/images/ic/raw/p0c00hym.jpg', + 'chapters': 'count:8', + 'description': 'md5:815fb51cbdaa270040aab8145b3f1d67', + 'uploader': 'Radio 3', + 'duration': 1800, + 'uploader_id': 'bbc_radio_three', + }, }, { # onion routes 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', 'only_matching': True, @@ -1128,6 +1129,13 @@ def _real_extract(self, url): 'uploader_id': network.get('id'), 'formats': formats, 'subtitles': subtitles, + 'chapters': traverse_obj(preload_state, ( + 'tracklist', 'tracks', lambda _, v: float_or_none(v['offset']['start']), { + 'title': ('titles', {lambda x: join_nonempty( + 'primary', 'secondary', 'tertiary', delim=' - ', from_dict=x)}), + 'start_time': ('offset', 'start', {float_or_none}), + 'end_time': ('offset', 'end', {float_or_none}), + })) or None, } bbc3_config = self._parse_json( From 2da7bcca16fdb40d4bdb2746643ba1a603771382 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 16 Sep 2023 18:57:14 -0500 Subject: [PATCH 153/218] Revert 9d376c4daeaf1279a011582f3f0e6ae42af520dd Authored by: bashonly --- yt_dlp/extractor/amazonminitv.py | 63 +++++++++++++++++++++++++++----- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/yt_dlp/extractor/amazonminitv.py b/yt_dlp/extractor/amazonminitv.py index ad23b16bd..b57d985d1 100644 --- a/yt_dlp/extractor/amazonminitv.py +++ b/yt_dlp/extractor/amazonminitv.py @@ -37,7 +37,7 @@ def _call_api(self, asin, data=None, note=None): return resp['data'][data['operationName']] -class AmazonMiniTVIE(InfoExtractor): +class AmazonMiniTVIE(AmazonMiniTVBaseIE): _VALID_URL = r'(?:https?://(?:www\.)?amazon\.in/minitv/tp/|amazonminitv:(?:amzn1\.dv\.gti\.)?)(?P<id>[a-f0-9-]+)' _TESTS = [{ 'url': 'https://www.amazon.in/minitv/tp/75fe3a75-b8fe-4499-8100-5c9424344840?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', @@ -86,14 +86,56 @@ class AmazonMiniTVIE(InfoExtractor): 'only_matching': True, }] + _GRAPHQL_QUERY_CONTENT = ''' +query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!, $contentType: ContentType!, $clientId: String) { + content( + applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} + contentId: $contentId + contentType: $contentType + ) { + contentId + name + ... on Episode { + contentId + vodType + name + images + description { + synopsis + contentLengthInSeconds + } + publicReleaseDateUTC + audioTracks + seasonId + seriesId + seriesName + seasonNumber + episodeNumber + timecode { + endCreditsTime + } + } + ... 
on MovieContent { + contentId + vodType + name + description { + synopsis + contentLengthInSeconds + } + images + publicReleaseDateUTC + audioTracks + } + } +}''' + def _real_extract(self, url): - video_uuid = self._match_id(url) - asin = f'amzn1.dv.gti.{video_uuid}' - webpage = self._download_webpage(f'https://www.amazon.in/minitv/tp/{video_uuid}', asin) - data = self._search_nextjs_data(webpage, asin)['props']['pageProps']['ssrProps'] + asin = f'amzn1.dv.gti.{self._match_id(url)}' + prs = self._call_api(asin, note='Downloading playback info') formats, subtitles = [], {} - for type_, asset in traverse_obj(data, ('playbackData', 'playbackAssets', {dict.items}, ...)): + for type_, asset in prs['playbackAssets'].items(): if not traverse_obj(asset, 'manifestUrl'): continue if type_ == 'hls': @@ -110,7 +152,12 @@ def _real_extract(self, url): else: self.report_warning(f'Unknown asset type: {type_}') - title_info = traverse_obj(data, ('contentData', {dict})) or {} + title_info = self._call_api( + asin, note='Downloading title info', data={ + 'operationName': 'content', + 'variables': {'contentId': asin}, + 'query': self._GRAPHQL_QUERY_CONTENT, + }) credits_time = try_get(title_info, lambda x: x['timecode']['endCreditsTime'] / 1000) is_episode = title_info.get('vodType') == 'EPISODE' @@ -145,7 +192,6 @@ class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:season' _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)' IE_DESC = 'Amazon MiniTV Season, "minitv:season:" prefix' - _WORKING = False _TESTS = [{ 'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', 'playlist_mincount': 6, @@ -205,7 +251,6 @@ class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE): IE_NAME = 'amazonminitv:series' _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)' IE_DESC = 'Amazon MiniTV Series, "minitv:series:" prefix' - _WORKING = False _TESTS = [{ 'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', 'playlist_mincount': 3, From 538d37671a17e0782d17f08df17800e2e3bd57c8 Mon Sep 17 00:00:00 2001 From: bashonly <bashonly@bashonly.com> Date: Sat, 16 Sep 2023 19:03:30 -0500 Subject: [PATCH 154/218] [ie/AmazonMiniTV] Fix extractors Closes #7817 Authored by: GautamMKGarg, bashonly Co-authored by: GautamMKGarg <GautamMKgarg@gmail.com> --- yt_dlp/extractor/amazonminitv.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/amazonminitv.py b/yt_dlp/extractor/amazonminitv.py index b57d985d1..2c71c5ef5 100644 --- a/yt_dlp/extractor/amazonminitv.py +++ b/yt_dlp/extractor/amazonminitv.py @@ -22,8 +22,11 @@ def _call_api(self, asin, data=None, note=None): resp = self._download_json( f'https://www.amazon.in/minitv/api/web/{"graphql" if data else "prs"}', - asin, note=note, headers={'Content-Type': 'application/json'}, - data=json.dumps(data).encode() if data else None, + asin, note=note, headers={ + 'Content-Type': 'application/json', + 'currentpageurl': '/', + 'currentplatform': 'dWeb' + }, data=json.dumps(data).encode() if data else None, query=None if data else { 'deviceType': 'A1WMMUXPCUJL4N', 'contentId': asin, @@ -46,7 +49,7 @@ class AmazonMiniTVIE(AmazonMiniTVBaseIE): 'ext': 'mp4', 'title': 'May I Kiss You?', 'language': 'Hindi', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', 'description': 'md5:a549bfc747973e04feb707833474e59d', 'release_timestamp': 1644710400, 'release_date': '20220213', @@ -68,7 +71,7 @@ class 
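Taken together, the reverted GraphQL query and the header fix in the next patch mean every metadata request is a JSON POST shaped roughly like this (the asin value is copied from the test above, everything else per the diff):

import json

# POST https://www.amazon.in/minitv/api/web/graphql
headers = {
    'Content-Type': 'application/json',
    'currentpageurl': '/',
    'currentplatform': 'dWeb',
}
data = json.dumps({
    'operationName': 'content',
    'variables': {'contentId': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840'},
    'query': _GRAPHQL_QUERY_CONTENT,
}).encode()
# _call_api() then unwraps the reply as resp['data']['content']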
AmazonMiniTVIE(AmazonMiniTVBaseIE): 'ext': 'mp4', 'title': 'Jahaan', 'language': 'Hindi', - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'description': 'md5:05eb765a77bf703f322f120ec6867339', 'release_timestamp': 1647475200, 'release_date': '20220317', From 9652bca1bd02f6bc1b8cb1e186f2ccbf32225561 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 16 Sep 2023 19:38:09 -0500 Subject: [PATCH 155/218] [ie/web.archive:vlive] Remove extractor (#8132) Closes #8122 Authored by: bashonly --- yt_dlp/extractor/_extractors.py | 1 - yt_dlp/extractor/archiveorg.py | 235 -------------------------------- yt_dlp/extractor/naver.py | 2 +- 3 files changed, 1 insertion(+), 237 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 4fed6d66a..bf0c67542 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -122,7 +122,6 @@ from .archiveorg import ( ArchiveOrgIE, YoutubeWebArchiveIE, - VLiveWebArchiveIE, ) from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index 2541cd6fd..a0b26ac5a 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -3,7 +3,6 @@ import urllib.parse from .common import InfoExtractor -from .naver import NaverBaseIE from .youtube import YoutubeBaseInfoExtractor, YoutubeIE from ..compat import compat_urllib_parse_unquote from ..networking import HEADRequest @@ -947,237 +946,3 @@ def _real_extract(self, url): if not info.get('title'): info['title'] = video_id return info - - -class VLiveWebArchiveIE(InfoExtractor): - IE_NAME = 'web.archive:vlive' - IE_DESC = 'web.archive.org saved vlive videos' - _VALID_URL = r'''(?x) - (?:https?://)?web\.archive\.org/ - (?:web/)?(?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? 
# /web and the version index is optional - (?:https?(?::|%3[Aa])//)?(?: - (?:(?:www|m)\.)?vlive\.tv(?::(?:80|443))?/(?:video|embed)/(?P<id>[0-9]+) # VLive URL - ) - ''' - _TESTS = [{ - 'url': 'https://web.archive.org/web/20221221144331/http://www.vlive.tv/video/1326', - 'md5': 'cc7314812855ce56de70a06a27314983', - 'info_dict': { - 'id': '1326', - 'ext': 'mp4', - 'title': "Girl's Day's Broadcast", - 'creator': "Girl's Day", - 'view_count': int, - 'uploader_id': 'muploader_a', - 'uploader_url': None, - 'uploader': None, - 'upload_date': '20150817', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': 1439816449, - 'like_count': int, - 'channel': 'Girl\'s Day', - 'channel_id': 'FDF27', - 'comment_count': int, - 'release_timestamp': 1439818140, - 'release_date': '20150817', - 'duration': 1014, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://web.archive.org/web/20221221182103/http://www.vlive.tv/video/16937', - 'info_dict': { - 'id': '16937', - 'ext': 'mp4', - 'title': '첸백시 걍방', - 'creator': 'EXO', - 'view_count': int, - 'subtitles': 'mincount:12', - 'uploader_id': 'muploader_j', - 'uploader_url': 'http://vlive.tv', - 'uploader': None, - 'upload_date': '20161112', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': 1478923074, - 'like_count': int, - 'channel': 'EXO', - 'channel_id': 'F94BD', - 'comment_count': int, - 'release_timestamp': 1478924280, - 'release_date': '20161112', - 'duration': 906, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870', - 'info_dict': { - 'id': '101870', - 'ext': 'mp4', - 'title': '[ⓓ xV] “레벨이들 매력에 반해? 안 반해?” 움직이는 HD 포토 (레드벨벳:Red Velvet)', - 'creator': 'Dispatch', - 'view_count': int, - 'subtitles': 'mincount:6', - 'uploader_id': 'V__FRA08071', - 'uploader_url': 'http://vlive.tv', - 'uploader': None, - 'upload_date': '20181130', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': 1543601327, - 'like_count': int, - 'channel': 'Dispatch', - 'channel_id': 'C796F3', - 'comment_count': int, - 'release_timestamp': 1543601040, - 'release_date': '20181130', - 'duration': 279, - }, - 'params': { - 'skip_download': True, - }, - }] - - # The wayback machine has special timestamp and "mode" values: - # timestamp: - # 1 = the first capture - # 2 = the last capture - # mode: - # id_ = Identity - perform no alterations of the original resource, return it as it was archived. 
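Put together, the conventions in the comment above mean the removed code addressed snapshots with URLs built like this (a sketch of the deleted f-string logic):

# 'id_' asks the Wayback Machine for the archived bytes unmodified;
# timestamp '2' means "the most recent capture"
def wayback_url(url, timestamp='2'):
    return f'https://web.archive.org/web/{timestamp}id_/{url}'

# wayback_url('http://www.vlive.tv/video/1326')
# -> 'https://web.archive.org/web/2id_/http://www.vlive.tv/video/1326'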
- _WAYBACK_BASE_URL = 'https://web.archive.org/web/2id_/' - - def _download_archived_page(self, url, video_id, *, timestamp='2', **kwargs): - for retry in self.RetryManager(): - try: - return self._download_webpage(f'https://web.archive.org/web/{timestamp}id_/{url}', video_id, **kwargs) - except ExtractorError as e: - if isinstance(e.cause, HTTPError) and e.cause.status == 404: - raise ExtractorError('Page was not archived', expected=True) - retry.error = e - continue - - def _download_archived_json(self, url, video_id, **kwargs): - page = self._download_archived_page(url, video_id, **kwargs) - if not page: - raise ExtractorError('Page was not archived', expected=True) - else: - return self._parse_json(page, video_id) - - def _extract_formats_from_m3u8(self, m3u8_url, params, video_id): - m3u8_doc = self._download_archived_page(m3u8_url, video_id, note='Downloading m3u8', query=params, fatal=False) - if not m3u8_doc: - return - - # M3U8 document should be changed to archive domain - m3u8_doc = m3u8_doc.splitlines() - url_base = m3u8_url.rsplit('/', 1)[0] - first_segment = None - for i, line in enumerate(m3u8_doc): - if not line.startswith('#'): - m3u8_doc[i] = f'{self._WAYBACK_BASE_URL}{url_base}/{line}?{urllib.parse.urlencode(params)}' - first_segment = first_segment or m3u8_doc[i] - - # Segments may not have been archived. See https://web.archive.org/web/20221127190050/http://www.vlive.tv/video/101870 - urlh = self._request_webpage(HEADRequest(first_segment), video_id, errnote=False, - fatal=False, note='Check first segment availablity') - if urlh: - formats, subtitles = self._parse_m3u8_formats_and_subtitles('\n'.join(m3u8_doc), ext='mp4', video_id=video_id) - if subtitles: - self._report_ignoring_subs('m3u8') - return formats - - # Closely follows the logic of the ArchiveTeam grab script - # See: https://github.com/ArchiveTeam/vlive-grab/blob/master/vlive.lua - def _real_extract(self, url): - video_id, url_date = self._match_valid_url(url).group('id', 'date') - - webpage = self._download_archived_page(f'https://www.vlive.tv/video/{video_id}', video_id, timestamp=url_date) - - player_info = self._search_json(r'__PRELOADED_STATE__\s*=', webpage, 'player info', video_id) - user_country = traverse_obj(player_info, ('common', 'userCountry')) - - main_script_url = self._search_regex(r'<script\s+src="([^"]+/js/main\.[^"]+\.js)"', webpage, 'main script url') - main_script = self._download_archived_page(main_script_url, video_id, note='Downloading main script') - app_id = self._search_regex(r'appId\s*=\s*"([^"]+)"', main_script, 'app id') - - inkey = self._download_archived_json( - f'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/{video_id}/inkey', video_id, note='Fetching inkey', query={ - 'appId': app_id, - 'platformType': 'PC', - 'gcc': user_country, - 'locale': 'en_US', - }, fatal=False) - - vod_id = traverse_obj(player_info, ('postDetail', 'post', 'officialVideo', 'vodId')) - - vod_data = self._download_archived_json( - f'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{vod_id}', video_id, note='Fetching vod data', query={ - 'key': inkey.get('inkey'), - 'pid': 'rmcPlayer_16692457559726800', # partially unix time and partially random. 
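The deleted _extract_formats_from_m3u8 above re-rooted every segment URI of an archived playlist onto the Wayback Machine before parsing it. For example, given a playlist fetched from <url_base>/playlist.m3u8 and params {'token': 'abc'} (values invented), a bare segment line such as

# seg-1-v1-a1.ts
# was rewritten to
# https://web.archive.org/web/2id_/<url_base>/seg-1-v1-a1.ts?token=abc
# while '#EXTM3U', '#EXTINF:...' and other directives were left untouched;
# the first rewritten segment was HEAD-checked before the playlist was
# handed to _parse_m3u8_formats_and_subtitles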
Fixed value used by archiveteam project - 'sid': '2024', - 'ver': '2.0', - 'devt': 'html5_pc', - 'doct': 'json', - 'ptc': 'https', - 'sptc': 'https', - 'cpt': 'vtt', - 'ctls': '%7B%22visible%22%3A%7B%22fullscreen%22%3Atrue%2C%22logo%22%3Afalse%2C%22playbackRate%22%3Afalse%2C%22scrap%22%3Afalse%2C%22playCount%22%3Atrue%2C%22commentCount%22%3Atrue%2C%22title%22%3Atrue%2C%22writer%22%3Atrue%2C%22expand%22%3Afalse%2C%22subtitles%22%3Atrue%2C%22thumbnails%22%3Atrue%2C%22quality%22%3Atrue%2C%22setting%22%3Atrue%2C%22script%22%3Afalse%2C%22logoDimmed%22%3Atrue%2C%22badge%22%3Atrue%2C%22seekingTime%22%3Atrue%2C%22muted%22%3Atrue%2C%22muteButton%22%3Afalse%2C%22viewerNotice%22%3Afalse%2C%22linkCount%22%3Afalse%2C%22createTime%22%3Afalse%2C%22thumbnail%22%3Atrue%7D%2C%22clicked%22%3A%7B%22expand%22%3Afalse%2C%22subtitles%22%3Afalse%7D%7D', - 'pv': '4.26.9', - 'dr': '1920x1080', - 'cpl': 'en_US', - 'lc': 'en_US', - 'adi': '%5B%7B%22type%22%3A%22pre%22%2C%22exposure%22%3Afalse%2C%22replayExposure%22%3Afalse%7D%5D', - 'adu': '%2F', - 'videoId': vod_id, - 'cc': user_country, - }) - - formats = [] - - streams = traverse_obj(vod_data, ('streams', ...)) - if len(streams) > 1: - self.report_warning('Multiple streams found. Only the first stream will be downloaded.') - stream = streams[0] - - max_stream = max( - stream.get('videos') or [], - key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None) - if max_stream is not None: - params = {arg.get('name'): arg.get('value') for arg in stream.get('keys', []) if arg.get('type') == 'param'} - formats = self._extract_formats_from_m3u8(max_stream.get('source'), params, video_id) or [] - - # For parts of the project MP4 files were archived - max_video = max( - traverse_obj(vod_data, ('videos', 'list', ...)), - key=lambda v: traverse_obj(v, ('bitrate', 'video'), default=0), default=None) - if max_video is not None: - video_url = self._WAYBACK_BASE_URL + max_video.get('source') - urlh = self._request_webpage(HEADRequest(video_url), video_id, errnote=False, - fatal=False, note='Check video availablity') - if urlh: - formats.append({'url': video_url}) - - return { - 'id': video_id, - 'formats': formats, - **traverse_obj(player_info, ('postDetail', 'post', { - 'title': ('officialVideo', 'title', {str}), - 'creator': ('author', 'nickname', {str}), - 'channel': ('channel', 'channelName', {str}), - 'channel_id': ('channel', 'channelCode', {str}), - 'duration': ('officialVideo', 'playTime', {int_or_none}), - 'view_count': ('officialVideo', 'playCount', {int_or_none}), - 'like_count': ('officialVideo', 'likeCount', {int_or_none}), - 'comment_count': ('officialVideo', 'commentCount', {int_or_none}), - 'timestamp': ('officialVideo', 'createdAt', {lambda x: int_or_none(x, scale=1000)}), - 'release_timestamp': ('officialVideo', 'willStartAt', {lambda x: int_or_none(x, scale=1000)}), - })), - **traverse_obj(vod_data, ('meta', { - 'uploader_id': ('user', 'id', {str}), - 'uploader': ('user', 'name', {str}), - 'uploader_url': ('user', 'url', {url_or_none}), - 'thumbnail': ('cover', 'source', {url_or_none}), - }), expected_type=lambda x: x or None), - **NaverBaseIE.process_subtitles(vod_data, lambda x: [self._WAYBACK_BASE_URL + x]), - } diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index d79caf5f3..2d8459b02 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -21,7 +21,7 @@ class NaverBaseIE(InfoExtractor): _CAPTION_EXT_RE = r'\.(?:ttml|vtt)' - @staticmethod # NB: Used in VLiveWebArchiveIE, WeverseIE + @staticmethod # NB: 
Used in WeverseIE def process_subtitles(vod_data, process_url): ret = {'subtitles': {}, 'automatic_captions': {}} for caption in traverse_obj(vod_data, ('captions', 'list', ...)): From 94389b225d9bcf29aa7ba8afaf1bbd7c62204eae Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 16 Sep 2023 21:42:42 -0500 Subject: [PATCH 156/218] [ie/RTVSLO] Fix format extraction (#8131) Closes #8020 Authored by: bashonly --- yt_dlp/extractor/rtvslo.py | 50 +++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/yt_dlp/extractor/rtvslo.py b/yt_dlp/extractor/rtvslo.py index 05942b6b4..39ace7cc6 100644 --- a/yt_dlp/extractor/rtvslo.py +++ b/yt_dlp/extractor/rtvslo.py @@ -1,6 +1,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, + int_or_none, parse_duration, traverse_obj, unified_timestamp, @@ -25,7 +26,7 @@ class RTVSLOIE(InfoExtractor): 'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv', 'info_dict': { 'id': '174842550', - 'ext': 'flv', + 'ext': 'mp4', 'release_timestamp': 1643140032, 'upload_date': '20220125', 'series': 'Dnevnik', @@ -69,7 +70,21 @@ class RTVSLOIE(InfoExtractor): 'tbr': 128000, 'release_date': '20220201', }, - + }, { + 'url': 'https://365.rtvslo.si/arhiv/razred-zase/148350750', + 'info_dict': { + 'id': '148350750', + 'ext': 'mp4', + 'title': 'Prvi šolski dan, mozaična oddaja za mlade', + 'series': 'Razred zase', + 'series_id': '148185730', + 'duration': 1481, + 'upload_date': '20121019', + 'timestamp': 1350672122, + 'release_date': '20121019', + 'release_timestamp': 1350672122, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/148185730/razred_zase_2014_logo_4d_wide2.jpg', + }, }, { 'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550', 'only_matching': True @@ -98,13 +113,14 @@ def _real_extract(self, url): media = self._download_json(self._API_BASE.format('getMedia', v_id), v_id, query={'jwt': jwt})['response'] formats = [] + skip_protocols = ['smil', 'f4m', 'dash'] adaptive_url = traverse_obj(media, ('addaptiveMedia', 'hls_sec'), expected_type=url_or_none) if adaptive_url: - formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=['smil']) + formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols) adaptive_url = traverse_obj(media, ('addaptiveMedia_sl', 'hls_sec'), expected_type=url_or_none) if adaptive_url: - for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=['smil']): + for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=skip_protocols): formats.append({ **f, 'format_id': 'sign-' + f['format_id'], @@ -114,19 +130,19 @@ def _real_extract(self, url): else f.get('language')) }) - formats.extend( - { - 'url': f['streams'][strm], - 'ext': traverse_obj(f, 'mediaType', expected_type=str.lower), - 'width': f.get('width'), - 'height': f.get('height'), - 'tbr': f.get('bitrate'), - 'filesize': f.get('filesize'), - } - for strm in ('http', 'https') - for f in media.get('mediaFiles') or [] - if traverse_obj(f, ('streams', strm)) - ) + for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: url_or_none(v['streams']['https']))): + formats.append(traverse_obj(mediafile, { + 'url': ('streams', 'https'), + 'ext': ('mediaType', {str.lower}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'tbr': ('bitrate', {int_or_none}), + 'filesize': ('filesize', {int_or_none}), + })) + + for mediafile in traverse_obj(media, ('mediaFiles', lambda _, v: 
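The rewritten loops above lean on traverse_obj's dict-template form; on a mediaFiles entry like the following (values invented), the first template yields a complete format dict:

mediafile = {
    'streams': {'https': 'https://example.com/video.mp4'},
    'mediaType': 'MP4',
    'width': 1920, 'height': 1080,
    'bitrate': 3500, 'filesize': 123456789,
}
# -> {'url': 'https://example.com/video.mp4', 'ext': 'mp4',
#     'width': 1920, 'height': 1080, 'tbr': 3500, 'filesize': 123456789}
# Entries without a valid 'streams'/'https' URL never reach the template,
# thanks to the url_or_none filter in the path.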
url_or_none(v['streams']['hls_sec']))): + formats.extend(self._extract_wowza_formats( + mediafile['streams']['hls_sec'], v_id, skip_protocols=skip_protocols)) if any('intermission.mp4' in x['url'] for x in formats): self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) From 836e06d246512f286f30c1371b2c54b72c9ecd93 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Sun, 17 Sep 2023 12:56:50 +0200 Subject: [PATCH 157/218] [core] Fix support for upcoming Python 3.12 (#8130) This also adds the following test runners: - `3.12-dev` on `ubuntu-latest` - `3.12-dev` on `windows-latest` - `pypy-3.10` on `ubuntu-latest` Authored by: Grub4K --- .github/workflows/core.yml | 5 ++++- devscripts/update-version.py | 4 ++-- yt_dlp/YoutubeDL.py | 2 +- yt_dlp/extractor/aws.py | 2 +- yt_dlp/extractor/goplay.py | 4 ++-- yt_dlp/extractor/motherless.py | 2 +- yt_dlp/extractor/panopto.py | 4 ++-- yt_dlp/networking/_urllib.py | 2 +- yt_dlp/networking/exceptions.py | 2 +- yt_dlp/utils/_utils.py | 12 ++++++++---- 10 files changed, 23 insertions(+), 16 deletions(-) diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index dead444c0..689408c50 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -13,13 +13,16 @@ jobs: matrix: os: [ubuntu-latest] # CPython 3.11 is in quick-test - python-version: ['3.8', '3.9', '3.10', pypy-3.7, pypy-3.8] + python-version: ['3.8', '3.9', '3.10', '3.12-dev', pypy-3.7, pypy-3.8, pypy-3.10] run-tests-ext: [sh] include: # atleast one of each CPython/PyPy tests must be in windows - os: windows-latest python-version: '3.7' run-tests-ext: bat + - os: windows-latest + python-version: '3.12-dev' + run-tests-ext: bat - os: windows-latest python-version: pypy-3.9 run-tests-ext: bat diff --git a/devscripts/update-version.py b/devscripts/update-version.py index c873d10a5..0144bd284 100644 --- a/devscripts/update-version.py +++ b/devscripts/update-version.py @@ -10,14 +10,14 @@ import argparse import contextlib import sys -from datetime import datetime +from datetime import datetime, timezone from devscripts.utils import read_version, run_process, write_file def get_new_version(version, revision): if not version: - version = datetime.utcnow().strftime('%Y.%m.%d') + version = datetime.now(timezone.utc).strftime('%Y.%m.%d') if revision: assert revision.isdigit(), 'Revision must be a number' diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 666d89b46..1feed3052 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -2591,7 +2591,7 @@ def _fill_common_fields(self, info_dict, final=True): # Working around out-of-range timestamp values (e.g. 
negative ones on Windows, # see http://bugs.python.org/issue1646728) with contextlib.suppress(ValueError, OverflowError, OSError): - upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) + upload_date = datetime.datetime.fromtimestamp(info_dict[ts_key], datetime.timezone.utc) info_dict[date_key] = upload_date.strftime('%Y%m%d') live_keys = ('is_live', 'was_live') diff --git a/yt_dlp/extractor/aws.py b/yt_dlp/extractor/aws.py index eb831a153..c4741a6a1 100644 --- a/yt_dlp/extractor/aws.py +++ b/yt_dlp/extractor/aws.py @@ -12,7 +12,7 @@ class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with def _aws_execute_api(self, aws_dict, video_id, query=None): query = query or {} - amz_date = datetime.datetime.utcnow().strftime('%Y%m%dT%H%M%SZ') + amz_date = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ') date = amz_date[:8] headers = { 'Accept': 'application/json', diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py index 960d7d7bc..0a3c8340f 100644 --- a/yt_dlp/extractor/goplay.py +++ b/yt_dlp/extractor/goplay.py @@ -383,9 +383,9 @@ def __get_current_timestamp(): months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - time_now = datetime.datetime.utcnow() + time_now = datetime.datetime.now(datetime.timezone.utc) format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day) - time_string = datetime.datetime.utcnow().strftime(format_string) + time_string = time_now.strftime(format_string) return time_string def __str__(self): diff --git a/yt_dlp/extractor/motherless.py b/yt_dlp/extractor/motherless.py index 769b52ce6..e359c44e9 100644 --- a/yt_dlp/extractor/motherless.py +++ b/yt_dlp/extractor/motherless.py @@ -151,7 +151,7 @@ def _real_extract(self, url): 'd': 'days', } kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} - upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') + upload_date = (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(**kwargs)).strftime('%Y%m%d') comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage)) uploader_id = self._html_search_regex( diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index 6e3c9f442..5ab2b2bce 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -1,7 +1,7 @@ import calendar import json import functools -from datetime import datetime +from datetime import datetime, timezone from random import random from .common import InfoExtractor @@ -243,7 +243,7 @@ def _mark_watched(self, base_url, video_id, delivery_info): invocation_id = delivery_info.get('InvocationId') stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str) if invocation_id and stream_id and duration: - timestamp_str = f'/Date({calendar.timegm(datetime.utcnow().timetuple())}000)/' + timestamp_str = f'/Date({calendar.timegm(datetime.now(timezone.utc).timetuple())}000)/' data = { 'streamRequests': [ { diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index b3e705b84..3c0647ecf 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -429,7 +429,7 @@ def _send(self, request): except urllib.error.HTTPError as e: if isinstance(e.fp, (http.client.HTTPResponse, urllib.response.addinfourl)): # Prevent file object from being closed when 
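The datetime changes in this patch all follow one pattern: the naive datetime.utcnow()/utcfromtimestamp(), deprecated in Python 3.12, are replaced by timezone-aware equivalents. The equivalence, sketched with a timestamp taken from one of the test cases earlier in this series:

import datetime as dt

# deprecated:  dt.datetime.utcfromtimestamp(1643140032)
# replacement (aware object, same wall-clock values in UTC):
when = dt.datetime.fromtimestamp(1643140032, dt.timezone.utc)
assert when.strftime('%Y%m%d') == '20220125'
# likewise dt.datetime.now(dt.timezone.utc) stands in for utcnow()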
urllib.error.HTTPError is destroyed. - e._closer.file = None + e._closer.close_called = True raise HTTPError(UrllibResponseAdapter(e.fp), redirect_loop='redirect error' in str(e)) from e raise # unexpected except urllib.error.URLError as e: diff --git a/yt_dlp/networking/exceptions.py b/yt_dlp/networking/exceptions.py index 10afc9ccb..465b18ba9 100644 --- a/yt_dlp/networking/exceptions.py +++ b/yt_dlp/networking/exceptions.py @@ -115,7 +115,7 @@ def __init__(self, http_error: HTTPError): hdrs=http_error.response.headers, fp=http_error.response ) - self._closer.file = None # Disable auto close + self._closer.close_called = True # Disable auto close self._http_error = http_error HTTPError.__init__(self, http_error.response, redirect_loop=http_error.redirect_loop) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 180bec245..ef26de116 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -669,6 +669,7 @@ def replace_insane(char): def sanitize_path(s, force=False): """Sanitizes and normalizes path on Windows""" + # XXX: this handles drive relative paths (c:sth) incorrectly if sys.platform == 'win32': force = False drive_or_unc, _ = os.path.splitdrive(s) @@ -687,7 +688,10 @@ def sanitize_path(s, force=False): sanitized_path.insert(0, drive_or_unc + os.path.sep) elif force and s and s[0] == os.path.sep: sanitized_path.insert(0, os.path.sep) - return os.path.join(*sanitized_path) + # TODO: Fix behavioral differences <3.12 + # The workaround using `normpath` only superficially passes tests + # Ref: https://github.com/python/cpython/pull/100351 + return os.path.normpath(os.path.join(*sanitized_path)) def sanitize_url(url, *, scheme='http'): @@ -1256,7 +1260,7 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): if precision == 'auto': auto_precision = True precision = 'microsecond' - today = datetime_round(datetime.datetime.utcnow(), precision) + today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision) if date_str in ('now', 'today'): return today if date_str == 'yesterday': @@ -1319,8 +1323,8 @@ def datetime_round(dt, precision='day'): 'second': 1, } roundto = lambda x, n: ((x + n / 2) // n) * n - timestamp = calendar.timegm(dt.timetuple()) - return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision])) + timestamp = roundto(calendar.timegm(dt.timetuple()), unit_seconds[precision]) + return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc) def hyphenate_date(date_str): From 30ba233d4cee945756ed7344e7ddb3a90d2ae608 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Sun, 17 Sep 2023 13:22:04 +0200 Subject: [PATCH 158/218] [devscripts] `make_changelog`: Fix changelog grouping and add networking group (#8124) Authored by: Grub4K --- devscripts/changelog_override.json | 21 ++++++- devscripts/make_changelog.py | 96 ++++++++++++++++-------------- 2 files changed, 71 insertions(+), 46 deletions(-) diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index d03db3f23..e7f453acf 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -68,6 +68,25 @@ { "action": "change", "when": "b03fa7834579a01cc5fba48c0e73488a16683d48", - "short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b" + "short": "[ie/twitter] Revert 92315c03774cfabb3a921884326beb4b981f786b", + "authors": ["pukkandan"] + }, + { + "action": "change", + "when": "fcd6a76adc49d5cd8783985c7ce35384b72e545f", + "short": "[test] Add tests for 
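A side effect of routing the re-joined parts of sanitize_path through os.path.normpath, worth keeping in mind alongside the TODO above: normpath also folds '.' and '..' components, one way the result can differ from the previous verbatim join. For instance:

import os.path

os.path.normpath(os.path.join('a', '..', 'b'))  # -> 'b'
os.path.normpath(os.path.join('.', 'a'))        # -> 'a'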
socks proxies (#7908)", + "authors": ["coletdjnz"] + }, + { + "action": "change", + "when": "4bf912282a34b58b6b35d8f7e6be535770c89c76", + "short": "[rh:urllib] Remove dot segments during URL normalization (#7662)", + "authors": ["coletdjnz"] + }, + { + "action": "change", + "when": "59e92b1f1833440bb2190f847eb735cf0f90bc85", + "short": "[rh:urllib] Simplify gzip decoding (#7611)", + "authors": ["Grub4K"] } ] diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py index 84f72d52f..ac68dcd19 100644 --- a/devscripts/make_changelog.py +++ b/devscripts/make_changelog.py @@ -31,35 +31,27 @@ class CommitGroup(enum.Enum): EXTRACTOR = 'Extractor' DOWNLOADER = 'Downloader' POSTPROCESSOR = 'Postprocessor' + NETWORKING = 'Networking' MISC = 'Misc.' - @classmethod - @property - def ignorable_prefixes(cls): - return ('core', 'downloader', 'extractor', 'misc', 'postprocessor', 'upstream') - @classmethod @lru_cache - def commit_lookup(cls): + def subgroup_lookup(cls): return { name: group for group, names in { - cls.PRIORITY: {'priority'}, cls.CORE: { 'aes', 'cache', 'compat_utils', 'compat', 'cookies', - 'core', 'dependencies', 'formats', 'jsinterp', - 'networking', 'outtmpl', 'plugins', 'update', - 'upstream', 'utils', }, cls.MISC: { @@ -67,23 +59,40 @@ def commit_lookup(cls): 'cleanup', 'devscripts', 'docs', - 'misc', 'test', }, - cls.EXTRACTOR: {'extractor', 'ie'}, - cls.DOWNLOADER: {'downloader', 'fd'}, - cls.POSTPROCESSOR: {'postprocessor', 'pp'}, + cls.NETWORKING: { + 'rh', + }, }.items() for name in names } @classmethod - def get(cls, value): - result = cls.commit_lookup().get(value) - if result: - logger.debug(f'Mapped {value!r} => {result.name}') + @lru_cache + def group_lookup(cls): + result = { + 'fd': cls.DOWNLOADER, + 'ie': cls.EXTRACTOR, + 'pp': cls.POSTPROCESSOR, + 'upstream': cls.CORE, + } + result.update({item.name.lower(): item for item in iter(cls)}) return result + @classmethod + def get(cls, value: str) -> tuple[CommitGroup | None, str | None]: + group, _, subgroup = (group.strip().lower() for group in value.partition('/')) + + result = cls.group_lookup().get(group) + if not result: + if subgroup: + return None, value + subgroup = group + result = cls.subgroup_lookup().get(subgroup) + + return result, subgroup or None + @dataclass class Commit: @@ -198,19 +207,23 @@ def _prepare_cleanup_misc_items(self, items): for commit_infos in cleanup_misc_items.values(): sorted_items.append(CommitInfo( 'cleanup', ('Miscellaneous',), ', '.join( - self._format_message_link(None, info.commit.hash).strip() + self._format_message_link(None, info.commit.hash) for info in sorted(commit_infos, key=lambda item: item.commit.hash or '')), [], Commit(None, '', commit_infos[0].commit.authors), [])) return sorted_items - def format_single_change(self, info): - message = self._format_message_link(info.message, info.commit.hash) + def format_single_change(self, info: CommitInfo): + message, sep, rest = info.message.partition('\n') + if '[' not in message: + # If the message doesn't already contain markdown links, try to add a link to the commit + message = self._format_message_link(message, info.commit.hash) + if info.issues: - message = message.replace('\n', f' ({self._format_issues(info.issues)})\n', 1) + message = f'{message} ({self._format_issues(info.issues)})' if info.commit.authors: - message = message.replace('\n', f' by {self._format_authors(info.commit.authors)}\n', 1) + message = f'{message} by {self._format_authors(info.commit.authors)}' if info.fixes: fix_message = ', 
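With the reworked lookups, a commit-message prefix is first split on '/' into group and (optional) subgroup; short aliases resolve through group_lookup(), and anything else falls back to subgroup_lookup(). Under the code above this behaves roughly as:

# CommitGroup.get('ie')           -> (CommitGroup.EXTRACTOR, None)
# CommitGroup.get('core/cookies') -> (CommitGroup.CORE, 'cookies')
# CommitGroup.get('cookies')      -> (CommitGroup.CORE, 'cookies')   # via subgroup_lookup
# CommitGroup.get('rh')           -> (CommitGroup.NETWORKING, 'rh')  # later rendered as 'Request Handler'
# CommitGroup.get('foo/bar')      -> (None, 'foo/bar')               # unknown group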
'.join(f'{self._format_message_link(None, fix.hash)}' for fix in info.fixes) @@ -219,16 +232,14 @@ def format_single_change(self, info): if authors != info.commit.authors: fix_message = f'{fix_message} by {self._format_authors(authors)}' - message = message.replace('\n', f' (With fixes in {fix_message})\n', 1) + message = f'{message} (With fixes in {fix_message})' - return message[:-1] + return message if not sep else f'{message}{sep}{rest}' def _format_message_link(self, message, hash): assert message or hash, 'Improperly defined commit message or override' message = message if message else hash[:HASH_LENGTH] - if not hash: - return f'{message}\n' - return f'[{message}\n'.replace('\n', f']({self.repo_url}/commit/{hash})\n', 1) + return f'[{message}]({self.repo_url}/commit/{hash})' if hash else message def _format_issues(self, issues): return ', '.join(f'[#{issue}]({self.repo_url}/issues/{issue})' for issue in issues) @@ -318,7 +329,7 @@ def _get_commits_and_fixes(self, default_author): for commitish, revert_commit in reverts.items(): reverted = commits.pop(commitish, None) if reverted: - logger.debug(f'{commit} fully reverted {reverted}') + logger.debug(f'{commitish} fully reverted {reverted}') else: commits[revert_commit.hash] = revert_commit @@ -337,7 +348,7 @@ def apply_overrides(self, overrides): for override in overrides: when = override.get('when') if when and when not in self and when != self._start: - logger.debug(f'Ignored {when!r}, not in commits {self._start!r}') + logger.debug(f'Ignored {when!r} override') continue override_hash = override.get('hash') or when @@ -365,7 +376,7 @@ def groups(self): for commit in self: upstream_re = self.UPSTREAM_MERGE_RE.search(commit.short) if upstream_re: - commit.short = f'[core/upstream] Merged with youtube-dl {upstream_re.group(1)}' + commit.short = f'[upstream] Merged with youtube-dl {upstream_re.group(1)}' match = self.MESSAGE_RE.fullmatch(commit.short) if not match: @@ -410,25 +421,20 @@ def details_from_prefix(prefix): if not prefix: return CommitGroup.CORE, None, () - prefix, _, details = prefix.partition('/') - prefix = prefix.strip() - details = details.strip() + prefix, *sub_details = prefix.split(':') - group = CommitGroup.get(prefix.lower()) - if group is CommitGroup.PRIORITY: - prefix, _, details = details.partition('/') + group, details = CommitGroup.get(prefix) + if group is CommitGroup.PRIORITY and details: + details = details.partition('/')[2].strip() - if not details and prefix and prefix not in CommitGroup.ignorable_prefixes: - logger.debug(f'Replaced details with {prefix!r}') - details = prefix or None + if details and '/' in details: + logger.error(f'Prefix is overnested, using first part: {prefix}') + details = details.partition('/')[0].strip() if details == 'common': details = None - - if details: - details, *sub_details = details.split(':') - else: - sub_details = [] + elif group is CommitGroup.NETWORKING and details == 'rh': + details = 'Request Handler' return group, details, sub_details From 58493923e9b6f774947a2131e5258e9f3cf816be Mon Sep 17 00:00:00 2001 From: soundchaser128 <69268557+soundchaser128@users.noreply.github.com> Date: Sun, 17 Sep 2023 17:09:42 +0200 Subject: [PATCH 159/218] [ie/rule34video] Extract tags (#7117) Authored by: soundchaser128 --- yt_dlp/extractor/rule34video.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/rule34video.py b/yt_dlp/extractor/rule34video.py index 9d15f4d21..f3250b557 100644 --- a/yt_dlp/extractor/rule34video.py +++ 
b/yt_dlp/extractor/rule34video.py @@ -1,6 +1,6 @@ import re -from ..utils import parse_duration +from ..utils import parse_duration, unescapeHTML from .common import InfoExtractor @@ -16,7 +16,8 @@ class Rule34VideoIE(InfoExtractor): 'title': 'Shot It-(mmd hmv)', 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065157/preview.jpg', 'duration': 347.0, - 'age_limit': 18 + 'age_limit': 18, + 'tags': 'count:14' } }, { @@ -28,7 +29,8 @@ class Rule34VideoIE(InfoExtractor): 'title': 'Lara in Trouble Ep. 7 [WildeerStudio]', 'thumbnail': 'https://rule34video.com/contents/videos_screenshots/3065000/3065296/preview.jpg', 'duration': 938.0, - 'age_limit': 18 + 'age_limit': 18, + 'tags': 'count:50' } }, ] @@ -57,5 +59,7 @@ def _real_extract(self, url): 'title': title, 'thumbnail': thumbnail, 'duration': parse_duration(duration), - 'age_limit': 18 + 'age_limit': 18, + 'tags': list(map(unescapeHTML, re.findall( + r'<a class="tag_item"[^>]+\bhref="https://rule34video\.com/tags/\d+/"[^>]*>(?P<tag>[^>]*)</a>', webpage))), } From efa2339502a37cf13ae7f143bd8b2c28f452d1cd Mon Sep 17 00:00:00 2001 From: Simon <simon30002021@icloud.com> Date: Sun, 17 Sep 2023 17:11:22 +0200 Subject: [PATCH 160/218] [ie/lecturio] Improve `_VALID_URL` (#7649) Authored by: simon300000 --- yt_dlp/extractor/lecturio.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/lecturio.py b/yt_dlp/extractor/lecturio.py index bb059d3a2..795012541 100644 --- a/yt_dlp/extractor/lecturio.py +++ b/yt_dlp/extractor/lecturio.py @@ -57,8 +57,8 @@ class LecturioIE(LecturioBaseIE): _VALID_URL = r'''(?x) https:// (?: - app\.lecturio\.com/([^/]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))| - (?:www\.)?lecturio\.de/[^/]+/(?P<nt_de>[^/?#&]+)\.vortrag + app\.lecturio\.com/([^/?#]+/(?P<nt>[^/?#&]+)\.lecture|(?:\#/)?lecture/c/\d+/(?P<id>\d+))| + (?:www\.)?lecturio\.de/(?:[^/?#]+/)+(?P<nt_de>[^/?#&]+)\.vortrag ) ''' _TESTS = [{ @@ -73,6 +73,9 @@ class LecturioIE(LecturioBaseIE): }, { 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag', 'only_matching': True, + }, { + 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-at-1-staatsexamen/oeffentliches-recht-staatsexamen.vortrag', + 'only_matching': True, }, { 'url': 'https://app.lecturio.com/#/lecture/c/6434/39634', 'only_matching': True, From 63e0c5748c0eb461a2ccca4181616eb930b4b750 Mon Sep 17 00:00:00 2001 From: aky-01 <65510015+aky-01@users.noreply.github.com> Date: Sun, 17 Sep 2023 17:16:11 +0200 Subject: [PATCH 161/218] [ie/IndavideoEmbed] Fix extraction (#8129) Closes #7190 Authored by: aky-01 --- yt_dlp/extractor/indavideo.py | 73 +++++++++++++++++------------------ 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/yt_dlp/extractor/indavideo.py b/yt_dlp/extractor/indavideo.py index 4fa97d8bb..564bf8a02 100644 --- a/yt_dlp/extractor/indavideo.py +++ b/yt_dlp/extractor/indavideo.py @@ -1,9 +1,9 @@ from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, parse_age_limit, parse_iso8601, + time_seconds, update_url_query, ) @@ -11,15 +11,14 @@ class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)' # Some example URLs covered by generic extractor: - # http://indavideo.hu/video/Vicces_cica_1 - # http://index.indavideo.hu/video/2015_0728_beregszasz - # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko - # 
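The tag extraction added above pairs a narrow findall with entity decoding; on a representative anchor (markup invented for illustration) it behaves like:

import re
from yt_dlp.utils import unescapeHTML

html = '<a class="tag_item" href="https://rule34video.com/tags/123/">female&amp;male</a>'
tags = [unescapeHTML(t) for t in re.findall(
    r'<a class="tag_item"[^>]+\bhref="https://rule34video\.com/tags/\d+/"[^>]*>(?P<tag>[^>]*)</a>', html)]
# -> ['female&male']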
http://erotika.indavideo.hu/video/Amator_tini_punci - # http://film.indavideo.hu/video/f_hrom_nagymamm_volt - # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes - _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)'] + # https://indavideo.hu/video/Vicces_cica_1 + # https://index.indavideo.hu/video/Hod_Nemetorszagban + # https://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # https://film.indavideo.hu/video/f_farkaslesen + # https://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)//embed\.indavideo\.hu/player/video/[\da-f]+)'] _TESTS = [{ - 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', + 'url': 'https://indavideo.hu/player/video/1bdc3c6d80/', 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', 'info_dict': { 'id': '1837039', @@ -36,21 +35,33 @@ class IndavideoEmbedIE(InfoExtractor): 'tags': ['tánc', 'cica', 'cuki', 'cukiajanlo', 'newsroom'], }, }, { - 'url': 'http://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1', - 'only_matching': True, - }, { - 'url': 'http://assets.indavideo.hu/swf/player.swf?v=fe25e500&vID=1bdc3c6d80&autostart=1&hide=1&i=1', + 'url': 'https://embed.indavideo.hu/player/video/1bdc3c6d80?autostart=1&hide=1', 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + 'url': 'https://indavideo.hu/video/Vicces_cica_1', + 'info_dict': { + 'id': '1335611', + 'ext': 'mp4', + 'title': 'Vicces cica', + 'description': 'Játszik a tablettel. :D', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Jet_Pack', + 'uploader_id': '491217', + 'timestamp': 1390821212, + 'upload_date': '20140127', + 'duration': 7, + 'age_limit': 0, + 'tags': ['cica', 'Jet_Pack'], + }, + }] def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - 'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id, - video_id)['data'] - - title = video['title'] + f'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/{video_id}/', + video_id, query={'_': time_seconds()})['data'] video_urls = [] @@ -60,33 +71,21 @@ def _real_extract(self, url): elif isinstance(video_files, dict): video_urls.extend(video_files.values()) - video_file = video.get('video_file') - if video: - video_urls.append(video_file) video_urls = list(set(video_urls)) - video_prefix = video_urls[0].rsplit('/', 1)[0] - - for flv_file in video.get('flv_files', []): - flv_url = '%s/%s' % (video_prefix, flv_file) - if flv_url not in video_urls: - video_urls.append(flv_url) - - filesh = video.get('filesh') + filesh = video.get('filesh') or {} formats = [] for video_url in video_urls: height = int_or_none(self._search_regex( r'\.(\d{3,4})\.mp4(?:\?|$)', video_url, 'height', default=None)) - if filesh: - if not height: - continue - token = filesh.get(compat_str(height)) - if token is None: - continue - video_url = update_url_query(video_url, {'token': token}) + if not height and len(filesh) == 1: + height = int_or_none(list(filesh.keys())[0]) + token = filesh.get(str(height)) + if token is None: + continue formats.append({ - 'url': video_url, + 'url': update_url_query(video_url, {'token': token}), 'height': height, }) @@ -103,7 +102,7 @@ def _real_extract(self, url): return { 'id': video.get('id') or video_id, - 'title': title, + 'title': video.get('title'), 'description': video.get('description'), 'thumbnails': thumbnails, 'uploader': video.get('user_name'), From 81f46ac573dc443ad48560f308582a26784d3015 Mon 
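The reworked format loop authorizes every file URL with a per-height token from the API's filesh mapping; sketched on invented data:

filesh = {'360': 'tok360', '720': 'tok720'}
video_url = 'https://cdn.example.hu/videos/1337.720.mp4'
# r'\.(\d{3,4})\.mp4(?:\?|$)' pulls 720 out of the URL; when the URL
# carries no height and filesh has exactly one entry, that key is used.
# update_url_query(video_url, {'token': filesh['720']})
# -> 'https://cdn.example.hu/videos/1337.720.mp4?token=tok720'
# URLs whose height has no matching token are skipped entirely.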
Sep 17 00:00:00 2001 From: Sebastian Koch <sebastian@0py.de> Date: Sun, 17 Sep 2023 22:54:00 +0200 Subject: [PATCH 162/218] [ie/massengeschmack.tv] Fix title extraction (#7813) Authored by: sb0stn --- yt_dlp/extractor/massengeschmacktv.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/yt_dlp/extractor/massengeschmacktv.py b/yt_dlp/extractor/massengeschmacktv.py index 7dacb43e0..1490e9b21 100644 --- a/yt_dlp/extractor/massengeschmacktv.py +++ b/yt_dlp/extractor/massengeschmacktv.py @@ -17,11 +17,12 @@ class MassengeschmackTVIE(InfoExtractor): _TEST = { 'url': 'https://massengeschmack.tv/play/fktv202', - 'md5': 'a9e054db9c2b5a08f0a0527cc201e8d3', + 'md5': '9996f314994a49fefe5f39aa1b07ae21', 'info_dict': { 'id': 'fktv202', 'ext': 'mp4', - 'title': 'Fernsehkritik-TV - Folge 202', + 'title': 'Fernsehkritik-TV #202', + 'thumbnail': 'https://cache.massengeschmack.tv/img/mag/fktv202.jpg' }, } @@ -29,9 +30,6 @@ def _real_extract(self, url): episode = self._match_id(url) webpage = self._download_webpage(url, episode) - title = clean_html(self._html_search_regex( - '<h3>([^<]+)</h3>', webpage, 'title')) - thumbnail = self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False) sources = self._parse_json(self._search_regex(r'(?s)MEDIA\s*=\s*(\[.+?\]);', webpage, 'media'), episode, js_to_json) formats = [] @@ -67,7 +65,8 @@ def _real_extract(self, url): return { 'id': episode, - 'title': title, + 'title': clean_html(self._html_search_regex( + r'<span[^>]+\bid=["\']clip-title["\'][^>]*>([^<]+)', webpage, 'title', fatal=False)), 'formats': formats, - 'thumbnail': thumbnail, + 'thumbnail': self._search_regex(r'POSTER\s*=\s*"([^"]+)', webpage, 'thumbnail', fatal=False), } From 20fbbd9249a2f26c7ae579bde5ba5d69aa8fac69 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Mon, 18 Sep 2023 07:33:26 +0000 Subject: [PATCH 163/218] [networking] Fix various socks proxy bugs (#8065) - Fixed support for IPv6 socks proxies - Fixed support for IPv6 over socks5 - Fixed --source-address not being obeyed for socks4 and socks5 - Fixed socks4a when the destination address is an IPv4 address Closes https://github.com/yt-dlp/yt-dlp/issues/7959 Fixes https://github.com/ytdl-org/youtube-dl/issues/15368 Authored by: coletdjnz Co-authored-by: Simon Sawicki <accounts@grub4k.xyz> Co-authored-by: bashonly <bashonly@bashonly.com> --- test/test_socks.py | 38 +++++--------------- yt_dlp/networking/_helper.py | 57 ++++++++++++++++++++++++++++++ yt_dlp/networking/_urllib.py | 68 +++++++++++++----------------------- yt_dlp/socks.py | 31 +++++++++------- 4 files changed, 110 insertions(+), 84 deletions(-) diff --git a/test/test_socks.py b/test/test_socks.py index 95ffce275..211ee814d 100644 --- a/test/test_socks.py +++ b/test/test_socks.py @@ -281,17 +281,13 @@ def test_socks4_auth(self, handler, ctx): rh, proxies={'all': f'socks4://user:@{server_address}'}) assert response['version'] == 4 - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='socks4a implementation currently broken when destination is not a domain name')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_socks4a_ipv4_target(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: with handler(proxies={'all': f'socks4a://{server_address}'}) as rh: response = ctx.socks_info_request(rh, target_domain='127.0.0.1') assert response['version'] == 4 - assert 
response['ipv4_address'] == '127.0.0.1' - assert response['domain_address'] is None + assert (response['ipv4_address'] == '127.0.0.1') != (response['domain_address'] == '127.0.0.1') @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_socks4a_domain_target(self, handler, ctx): @@ -302,10 +298,7 @@ def test_socks4a_domain_target(self, handler, ctx): assert response['ipv4_address'] is None assert response['domain_address'] == 'localhost' - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='source_address is not yet supported for socks4 proxies')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_ipv4_client_source_address(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler) as server_address: source_address = f'127.0.0.{random.randint(5, 255)}' @@ -327,10 +320,7 @@ def test_socks4_errors(self, handler, ctx, reply_code): with pytest.raises(ProxyError): ctx.socks_info_request(rh) - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='IPv6 socks4 proxies are not yet supported')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_ipv6_socks4_proxy(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler, bind_ip='::1') as server_address: with handler(proxies={'all': f'socks4://{server_address}'}) as rh: @@ -342,7 +332,7 @@ def test_ipv6_socks4_proxy(self, handler, ctx): @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_timeout(self, handler, ctx): with ctx.socks_server(Socks4ProxyHandler, sleep=2) as server_address: - with handler(proxies={'all': f'socks4://{server_address}'}, timeout=1) as rh: + with handler(proxies={'all': f'socks4://{server_address}'}, timeout=0.5) as rh: with pytest.raises(TransportError): ctx.socks_info_request(rh) @@ -383,7 +373,7 @@ def test_socks5_domain_target(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: response = ctx.socks_info_request(rh, target_domain='localhost') - assert response['ipv4_address'] == '127.0.0.1' + assert (response['ipv4_address'] == '127.0.0.1') != (response['ipv6_address'] == '::1') assert response['version'] == 5 @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) @@ -404,22 +394,15 @@ def test_socks5h_ip_target(self, handler, ctx): assert response['domain_address'] is None assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='IPv6 destination addresses are not yet supported')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_socks5_ipv6_destination(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: response = ctx.socks_info_request(rh, target_domain='[::1]') assert response['ipv6_address'] == '::1' - assert response['port'] == 80 assert response['version'] == 5 - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='IPv6 socks5 proxies are not yet supported')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_ipv6_socks5_proxy(self, handler, ctx): with 
ctx.socks_server(Socks5ProxyHandler, bind_ip='::1') as server_address: with handler(proxies={'all': f'socks5://{server_address}'}) as rh: @@ -430,10 +413,7 @@ def test_ipv6_socks5_proxy(self, handler, ctx): # XXX: is there any feasible way of testing IPv6 source addresses? # Same would go for non-proxy source_address test... - @pytest.mark.parametrize('handler,ctx', [ - pytest.param('Urllib', 'http', marks=pytest.mark.xfail( - reason='source_address is not yet supported for socks5 proxies')) - ], indirect=True) + @pytest.mark.parametrize('handler,ctx', [('Urllib', 'http')], indirect=True) def test_ipv4_client_source_address(self, handler, ctx): with ctx.socks_server(Socks5ProxyHandler) as server_address: source_address = f'127.0.0.{random.randint(5, 255)}' diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index a43c57bb4..4c9dbf25d 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -2,6 +2,7 @@ import contextlib import functools +import socket import ssl import sys import typing @@ -206,3 +207,59 @@ def wrapper(self, *args, **kwargs): e.handler = self raise return wrapper + + +def _socket_connect(ip_addr, timeout, source_address): + af, socktype, proto, canonname, sa = ip_addr + sock = socket.socket(af, socktype, proto) + try: + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: + sock.settimeout(timeout) + if source_address: + sock.bind(source_address) + sock.connect(sa) + return sock + except socket.error: + sock.close() + raise + + +def create_connection( + address, + timeout=socket._GLOBAL_DEFAULT_TIMEOUT, + source_address=None, + *, + _create_socket_func=_socket_connect +): + # Work around socket.create_connection() which tries all addresses from getaddrinfo() including IPv6. + # This filters the addresses based on the given source_address. + # Based on: https://github.com/python/cpython/blob/main/Lib/socket.py#L810 + host, port = address + ip_addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) + if not ip_addrs: + raise socket.error('getaddrinfo returns an empty list') + if source_address is not None: + af = socket.AF_INET if ':' not in source_address[0] else socket.AF_INET6 + ip_addrs = [addr for addr in ip_addrs if addr[0] == af] + if not ip_addrs: + raise OSError( + f'No remote IPv{4 if af == socket.AF_INET else 6} addresses available for connect. ' + f'Can\'t use "{source_address[0]}" as source address') + + err = None + for ip_addr in ip_addrs: + try: + sock = _create_socket_func(ip_addr, timeout, source_address) + # Explicitly break __traceback__ reference cycle + # https://bugs.python.org/issue36820 + err = None + return sock + except socket.error as e: + err = e + + try: + raise err + finally: + # Explicitly break __traceback__ reference cycle + # https://bugs.python.org/issue36820 + err = None diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index 3c0647ecf..c327f7744 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -23,6 +23,7 @@ from ._helper import ( InstanceStoreMixin, add_accept_encoding_header, + create_connection, get_redirect_method, make_socks_proxy_opts, select_proxy, @@ -54,44 +55,10 @@ def _create_http_connection(http_class, source_address, *args, **kwargs): hc = http_class(*args, **kwargs) + if hasattr(hc, '_create_connection'): + hc._create_connection = create_connection + if source_address is not None: - # This is to workaround _create_connection() from socket where it will try all - # address data from getaddrinfo() including IPv6. 
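The new shared helper narrows the getaddrinfo() results to the family of the bound source address before trying to connect, instead of blindly attempting IPv6 records. A sketch of a direct call, with placeholder values:

from yt_dlp.networking._helper import create_connection

sock = create_connection(
    ('example.com', 80), timeout=10.0,
    source_address=('192.0.2.10', 0))  # port 0 = any local port
# With an IPv4 source address only A records are attempted; if the host
# resolved exclusively to IPv6 addresses, this raises OSError up front
# rather than failing later with an unhelpful bind error.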
This filters the result from - # getaddrinfo() based on the source_address value. - # This is based on the cpython socket.create_connection() function. - # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 - def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): - host, port = address - err = None - addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) - af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6 - ip_addrs = [addr for addr in addrs if addr[0] == af] - if addrs and not ip_addrs: - ip_version = 'v4' if af == socket.AF_INET else 'v6' - raise OSError( - "No remote IP%s addresses available for connect, can't use '%s' as source address" - % (ip_version, source_address[0])) - for res in ip_addrs: - af, socktype, proto, canonname, sa = res - sock = None - try: - sock = socket.socket(af, socktype, proto) - if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: - sock.settimeout(timeout) - sock.bind(source_address) - sock.connect(sa) - err = None # Explicitly break reference cycle - return sock - except OSError as _: - err = _ - if sock is not None: - sock.close() - if err is not None: - raise err - else: - raise OSError('getaddrinfo returns an empty list') - if hasattr(hc, '_create_connection'): - hc._create_connection = _create_connection hc.source_address = (source_address, 0) return hc @@ -220,13 +187,28 @@ def make_socks_conn_class(base_class, socks_proxy): proxy_args = make_socks_proxy_opts(socks_proxy) class SocksConnection(base_class): - def connect(self): - self.sock = sockssocket() - self.sock.setproxy(**proxy_args) - if type(self.timeout) in (int, float): # noqa: E721 - self.sock.settimeout(self.timeout) - self.sock.connect((self.host, self.port)) + _create_connection = create_connection + def connect(self): + def sock_socket_connect(ip_addr, timeout, source_address): + af, socktype, proto, canonname, sa = ip_addr + sock = sockssocket(af, socktype, proto) + try: + connect_proxy_args = proxy_args.copy() + connect_proxy_args.update({'addr': sa[0], 'port': sa[1]}) + sock.setproxy(**connect_proxy_args) + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: # noqa: E721 + sock.settimeout(timeout) + if source_address: + sock.bind(source_address) + sock.connect((self.host, self.port)) + return sock + except socket.error: + sock.close() + raise + self.sock = create_connection( + (proxy_args['addr'], proxy_args['port']), timeout=self.timeout, + source_address=self.source_address, _create_socket_func=sock_socket_connect) if isinstance(self, http.client.HTTPSConnection): self.sock = self._context.wrap_socket(self.sock, server_hostname=self.host) diff --git a/yt_dlp/socks.py b/yt_dlp/socks.py index f93328f63..e7f41d7e2 100644 --- a/yt_dlp/socks.py +++ b/yt_dlp/socks.py @@ -134,26 +134,31 @@ def _check_response_version(self, expected_version, got_version): self.close() raise InvalidVersionError(expected_version, got_version) - def _resolve_address(self, destaddr, default, use_remote_dns): - try: - return socket.inet_aton(destaddr) - except OSError: - if use_remote_dns and self._proxy.remote_dns: - return default - else: - return socket.inet_aton(socket.gethostbyname(destaddr)) + def _resolve_address(self, destaddr, default, use_remote_dns, family=None): + for f in (family,) if family else (socket.AF_INET, socket.AF_INET6): + try: + return f, socket.inet_pton(f, destaddr) + except OSError: + continue + + if use_remote_dns and self._proxy.remote_dns: + return 0, default + else: + res = socket.getaddrinfo(destaddr, None, 
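After the fix, _resolve_address reports which address family a literal parses as, so the SOCKS5 request built below can carry the matching address type; roughly (assuming a socks5h proxy, i.e. remote_dns enabled):

# self._resolve_address('127.0.0.1', None, use_remote_dns=True)
#   -> (socket.AF_INET, b'\x7f\x00\x00\x01')   # inet_pton(AF_INET) succeeds
# self._resolve_address('::1', None, use_remote_dns=True)
#   -> (socket.AF_INET6, <16 packed bytes>)    # falls through to AF_INET6
# self._resolve_address('example.com', None, use_remote_dns=True)
#   -> (0, None)                               # hostname left for the proxy to resolve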
family=family or 0) + f, _, _, _, ipaddr = res[0] + return f, socket.inet_pton(f, ipaddr[0]) def _setup_socks4(self, address, is_4a=False): destaddr, port = address - ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a) + _, ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a, family=socket.AF_INET) packet = struct.pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr username = (self._proxy.username or '').encode() packet += username + b'\x00' - if is_4a and self._proxy.remote_dns: + if is_4a and self._proxy.remote_dns and ipaddr == SOCKS4_DEFAULT_DSTIP: packet += destaddr.encode() + b'\x00' self.sendall(packet) @@ -210,7 +215,7 @@ def _socks5_auth(self): def _setup_socks5(self, address): destaddr, port = address - ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True) + family, ipaddr = self._resolve_address(destaddr, None, use_remote_dns=True) self._socks5_auth() @@ -220,8 +225,10 @@ def _setup_socks5(self, address): destaddr = destaddr.encode() packet += struct.pack('!B', Socks5AddressType.ATYP_DOMAINNAME) packet += self._len_and_data(destaddr) - else: + elif family == socket.AF_INET: packet += struct.pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr + elif family == socket.AF_INET6: + packet += struct.pack('!B', Socks5AddressType.ATYP_IPV6) + ipaddr packet += struct.pack('!H', port) self.sendall(packet) From ba8e9eb2c8bbb699f314169fab8e544437ad731e Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Mon, 18 Sep 2023 15:08:40 -0600 Subject: [PATCH 164/218] [ie/radiofrance] Add support for livestreams, podcasts, playlists (#7006) Closes #4282 Authored by: elyse0 --- yt_dlp/extractor/_extractors.py | 9 +- yt_dlp/extractor/radiofrance.py | 379 +++++++++++++++++++++++++++++++- 2 files changed, 382 insertions(+), 6 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index bf0c67542..ec3ae0e66 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1555,7 +1555,14 @@ from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE -from .radiofrance import FranceCultureIE, RadioFranceIE +from .radiofrance import ( + FranceCultureIE, + RadioFranceIE, + RadioFranceLiveIE, + RadioFrancePodcastIE, + RadioFranceProfileIE, + RadioFranceProgramScheduleIE, +) from .radiozet import RadioZetPodcastIE from .radiokapital import ( RadioKapitalIE, diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py index 92e51b7f4..35f4b91dd 100644 --- a/yt_dlp/extractor/radiofrance.py +++ b/yt_dlp/extractor/radiofrance.py @@ -1,7 +1,18 @@ +import itertools import re +import urllib.parse from .common import InfoExtractor -from ..utils import parse_duration, unified_strdate +from ..utils import ( + int_or_none, + join_nonempty, + js_to_json, + parse_duration, + strftime_or_none, + traverse_obj, + unified_strdate, + urljoin, +) class RadioFranceIE(InfoExtractor): @@ -56,8 +67,32 @@ def _real_extract(self, url): } -class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])' +class RadioFranceBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?radiofrance\.fr' + + _STATIONS_RE = '|'.join(map(re.escape, ( + 'franceculture', + 'franceinfo', + 'franceinter', + 'francemusique', + 'fip', + 'mouv', + ))) + + def 
_extract_data_from_webpage(self, webpage, display_id, key): + return traverse_obj(self._search_json( + r'\bconst\s+data\s*=', webpage, key, display_id, + contains_pattern=r'(\[\{.*?\}\]);', transform_source=js_to_json), + (..., 'data', key, {dict}), get_all=False) or {} + + +class FranceCultureIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?:{RadioFranceBaseIE._STATIONS_RE}) + /podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d{{6,}})(?:$|[?#]) + ''' + _TESTS = [ { 'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487', @@ -67,14 +102,30 @@ class FranceCultureIE(InfoExtractor): 'ext': 'mp3', 'title': 'La physique d’Einstein aiderait-elle à comprendre le cerveau ?', 'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?', - 'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'upload_date': '20220514', 'duration': 2750, }, }, + { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9-30/le-7-9-30-du-vendredi-10-mars-2023-2107675', + 'info_dict': { + 'id': '2107675', + 'display_id': 'le-7-9-30-du-vendredi-10-mars-2023', + 'title': 'Inflation alimentaire : comment en sortir ? - Régis Debray et Claude Grange - Cybèle Idelot', + 'description': 'md5:36ee74351ede77a314fdebb94026b916', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'upload_date': '20230310', + 'duration': 8977, + 'ext': 'mp3', + }, + }, { 'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507', 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-sciences/sante-bientot-un-vaccin-contre-l-asthme-allergique-3057200', + 'only_matching': True, } ] @@ -89,7 +140,6 @@ def _real_extract(self, url): 'id': video_id, 'display_id': display_id, 'url': video_data['contentUrl'], - 'ext': video_data.get('encodingFormat'), 'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None, 'duration': parse_duration(video_data.get('duration')), 'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', @@ -102,3 +152,322 @@ def _real_extract(self, url): 'upload_date': unified_strdate(self._search_regex( r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)) } + + +class RadioFranceLiveIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + https?://(?:www\.)?radiofrance\.fr + /(?P<id>{RadioFranceBaseIE._STATIONS_RE}) + /?(?P<substation_id>radio-[\w-]+)?(?:[#?]|$) + ''' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinter/', + 'info_dict': { + 'id': 'franceinter', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/franceculture', + 'info_dict': { + 'id': 'franceculture', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-musique-kids-family', + 'info_dict': { + 'id': 'mouv-radio-musique-kids-family', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-rnb-soul', + 'info_dict': { 
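# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): below, RadioFranceLiveIE's
# _real_extract walks the station API response with yt_dlp.utils.traverse_obj.
# A minimal standalone sketch of that traversal pattern -- the sample payload
# here is invented for illustration only:
from yt_dlp.utils import traverse_obj

_sample_response = {'now': {'media': {'sources': [
    {'url': 'https://example.invalid/hls.m3u8', 'format': 'hls'},
    {'url': 'https://example.invalid/stream.aac', 'bitrate': 192},
    {'url': None},  # sources without a usable URL are filtered out
]}}}

# (('now', None), ...) branches over both response shapes the endpoint may
# return; the callable key keeps only dicts whose 'url' is truthy.
_sources = traverse_obj(
    _sample_response, (('now', None), 'media', 'sources', lambda _, v: v['url']))
assert len(_sources) == 2 and _sources[0]['format'] == 'hls'
# ---------------------------------------------------------------------------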
+ 'id': 'mouv-radio-rnb-soul', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv/radio-musique-mix', + 'info_dict': { + 'id': 'mouv-radio-musique-mix', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/fip/radio-rock', + 'info_dict': { + 'id': 'fip-radio-rock', + 'title': str, + 'live_status': 'is_live', + 'ext': 'aac', + }, + 'params': { + 'skip_download': 'Livestream', + }, + }, { + 'url': 'https://www.radiofrance.fr/mouv', + 'only_matching': True, + }] + + def _real_extract(self, url): + station_id, substation_id = self._match_valid_url(url).group('id', 'substation_id') + + if substation_id: + webpage = self._download_webpage(url, station_id) + api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData') + else: + api_response = self._download_json( + f'https://www.radiofrance.fr/{station_id}/api/live', station_id) + + formats, subtitles = [], {} + for media_source in traverse_obj(api_response, (('now', None), 'media', 'sources', lambda _, v: v['url'])): + if media_source.get('format') == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(media_source['url'], station_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': media_source['url'], + 'abr': media_source.get('bitrate'), + }) + + return { + 'id': join_nonempty(station_id, substation_id), + 'title': traverse_obj(api_response, ('visual', 'legend')) or join_nonempty( + ('now', 'firstLine', 'title'), ('now', 'secondLine', 'title'), from_dict=api_response, delim=' - '), + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + } + + +class RadioFrancePlaylistBase(RadioFranceBaseIE): + """Subclasses must set _METADATA_KEY""" + + def _call_api(self, content_id, cursor, page_num): + raise NotImplementedError('This method must be implemented by subclasses') + + def _generate_playlist_entries(self, content_id, content_response): + for page_num in itertools.count(2): + for entry in content_response['items']: + yield self.url_result( + f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, { + 'title': 'title', + 'description': 'standFirst', + 'timestamp': ('publishedDate', {int_or_none}), + 'thumbnail': ('visual', 'src'), + })) + + next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False) + if not next_cursor: + break + + content_response = self._call_api(content_id, next_cursor, page_num) + + def _real_extract(self, url): + display_id = self._match_id(url) + + metadata = self._download_json( + 'https://www.radiofrance.fr/api/v2.1/path', display_id, + query={'value': urllib.parse.urlparse(url).path})['content'] + + content_id = metadata['id'] + + return self.playlist_result( + self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id, + display_id=display_id, **{**traverse_obj(metadata, { + 'title': 'title', + 'description': 'standFirst', + 'thumbnail': ('visual', 'src'), + }), **traverse_obj(metadata, { + 'title': 'name', + 'description': 'role', + })}) + + +class RadioFrancePodcastIE(RadioFrancePlaylistBase): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?:{RadioFranceBaseIE._STATIONS_RE}) + /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$) + ''' + + _TESTS = [{ + 'url': 
'https://www.radiofrance.fr/franceinfo/podcasts/le-billet-vert', + 'info_dict': { + 'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17', + 'display_id': 'le-billet-vert', + 'title': 'Le billet sciences', + 'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale', + 'info_dict': { + 'id': '566fd524-3074-4fbc-ac69-8696f2152a54', + 'display_id': 'jean-marie-le-pen-l-obsession-nationale', + 'title': 'Jean-Marie Le Pen, l\'obsession nationale', + 'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_count': 7, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine', + 'info_dict': { + 'id': '63c1ddc9-9f15-457a-98b2-411bac63f48d', + 'display_id': 'serie-thomas-grjebine', + 'title': 'Thomas Grjebine', + }, + 'playlist_count': 1, + }, { + 'url': 'https://www.radiofrance.fr/fip/podcasts/certains-l-aiment-fip', + 'info_dict': { + 'id': '143dff38-e956-4a5d-8576-1c0b7242b99e', + 'display_id': 'certains-l-aiment-fip', + 'title': 'Certains l’aiment Fip', + 'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 321, + }, { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9', + 'only_matching': True, + }, { + 'url': 'https://www.radiofrance.fr/mouv/podcasts/dirty-mix', + 'only_matching': True, + }] + + _METADATA_KEY = 'expressions' + + def _call_api(self, podcast_id, cursor, page_num): + return self._download_json( + f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id, + note=f'Downloading page {page_num}', query={'pageCursor': cursor}) + + +class RadioFranceProfileIE(RadioFrancePlaylistBase): + _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3', + 'info_dict': { + 'id': '86c62790-e481-11e2-9f7b-782bcb6744eb', + 'display_id': 'thomas-pesquet', + 'title': 'Thomas Pesquet', + 'description': 'Astronaute à l\'agence spatiale européenne', + }, + 'playlist_mincount': 212, + }, { + 'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie', + 'info_dict': { + 'id': '9593050b-0183-4972-a0b5-d8f699079e02', + 'display_id': 'eugenie-bastie', + 'title': 'Eugénie Bastié', + 'description': 'Journaliste et essayiste', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + }, + 'playlist_mincount': 39, + }, { + 'url': 'https://www.radiofrance.fr/personnes/lea-salame', + 'only_matching': True, + }] + + _METADATA_KEY = 'documents' + + def _call_api(self, profile_id, cursor, page_num): + resp = self._download_json( + f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id, + note=f'Downloading page {page_num}', query={ + 'relation': 'personality', + 'cursor': cursor, + }) + + resp['next'] = traverse_obj(resp, ('pagination', 'next')) + return resp + + +class RadioFranceProgramScheduleIE(RadioFranceBaseIE): + _VALID_URL = rf'''(?x) + {RadioFranceBaseIE._VALID_URL_BASE} + /(?P<station>{RadioFranceBaseIE._STATIONS_RE}) + /grille-programmes(?:\?date=(?P<date>[\d-]+))? 
+ ''' + + _TESTS = [{ + 'url': 'https://www.radiofrance.fr/franceinter/grille-programmes?date=17-02-2023', + 'info_dict': { + 'id': 'franceinter-program-20230217', + 'upload_date': '20230217', + }, + 'playlist_count': 25, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023', + 'info_dict': { + 'id': 'franceculture-program-20230201', + 'upload_date': '20230201', + }, + 'playlist_count': 25, + }, { + 'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023', + 'info_dict': { + 'id': 'mouv-program-20230319', + 'upload_date': '20230319', + }, + 'playlist_count': 3, + }, { + 'url': 'https://www.radiofrance.fr/francemusique/grille-programmes?date=18-03-2023', + 'info_dict': { + 'id': 'francemusique-program-20230318', + 'upload_date': '20230318', + }, + 'playlist_count': 15, + }, { + 'url': 'https://www.radiofrance.fr/franceculture/grille-programmes', + 'only_matching': True, + }] + + def _generate_playlist_entries(self, webpage_url, api_response): + for entry in traverse_obj(api_response, ('steps', lambda _, v: v['expression']['path'])): + yield self.url_result( + urljoin(webpage_url, f'/{entry["expression"]["path"]}'), ie=FranceCultureIE, + url_transparent=True, **traverse_obj(entry, { + 'title': ('expression', 'title'), + 'thumbnail': ('expression', 'visual', 'src'), + 'timestamp': ('startTime', {int_or_none}), + 'series_id': ('concept', 'id'), + 'series': ('concept', 'title'), + })) + + def _real_extract(self, url): + station, date = self._match_valid_url(url).group('station', 'date') + webpage = self._download_webpage(url, station) + grid_data = self._extract_data_from_webpage(webpage, station, 'grid') + upload_date = strftime_or_none(grid_data.get('date'), '%Y%m%d') + + return self.playlist_result( + self._generate_playlist_entries(url, grid_data), + join_nonempty(station, 'program', upload_date), upload_date=upload_date) From 9e68747f9607f05e92bb7d9b6e79d678b50070e1 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 18 Sep 2023 19:02:00 -0400 Subject: [PATCH 165/218] [ie/bilibili] Add support for series, favorites and watch later (#7518) Closes #6719 Authored by: c-basalt --- yt_dlp/extractor/_extractors.py | 6 +- yt_dlp/extractor/bilibili.py | 281 ++++++++++++++++++++++++++++++-- 2 files changed, 272 insertions(+), 15 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index ec3ae0e66..a6a286766 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -223,7 +223,11 @@ BiliBiliPlayerIE, BilibiliSpaceVideoIE, BilibiliSpaceAudioIE, - BilibiliSpacePlaylistIE, + BilibiliCollectionListIE, + BilibiliSeriesListIE, + BilibiliFavoritesListIE, + BilibiliWatchlaterIE, + BilibiliPlaylistIE, BiliIntlIE, BiliIntlSeriesIE, BiliLiveIE, diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 290340078..5e7042dbb 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -15,6 +15,7 @@ GeoRestrictedError, InAdvancePagedList, OnDemandPagedList, + bool_or_none, filter_dict, float_or_none, format_field, @@ -35,6 +36,7 @@ unsmuggle_url, url_or_none, urlencode_postdata, + variadic, ) @@ -156,7 +158,7 @@ def _get_episodes_from_season(self, ss_id, url): class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)' + _VALID_URL = 
r'https?://(?:www\.)?bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -252,7 +254,7 @@ class BiliBiliIE(BilibiliBaseIE): 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4', 'duration': 313.557, 'upload_date': '20220709', - 'uploader': '小夫Tech', + 'uploader': '小夫太渴', 'timestamp': 1657347907, 'uploader_id': '1326814124', 'comment_count': int, @@ -509,7 +511,7 @@ def _real_extract(self, url): class BiliBiliBangumiMediaIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/media/md(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', 'info_dict': { @@ -528,7 +530,7 @@ def _real_extract(self, url): class BiliBiliBangumiSeasonIE(BilibiliBaseIE): - _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)' + _VALID_URL = r'(?x)https?://(?:www\.)?bilibili\.com/bangumi/play/ss(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/play/ss26801', 'info_dict': { @@ -679,13 +681,35 @@ def get_entries(page_data): return self.playlist_result(paged_list, playlist_id) -class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE): - _VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)' +class BilibiliSpaceListBaseIE(BilibiliSpaceBaseIE): + def _get_entries(self, page_data, bvid_keys, ending_key='bvid'): + for bvid in traverse_obj(page_data, (*variadic(bvid_keys, (str, bytes, dict, set)), ..., ending_key, {str})): + yield self.url_result(f'https://www.bilibili.com/video/{bvid}', BiliBiliIE, bvid) + + def _get_uploader(self, uid, playlist_id): + webpage = self._download_webpage(f'https://space.bilibili.com/{uid}', playlist_id, fatal=False) + return self._search_regex(r'(?s)<title\b[^>]*>([^<]+)的个人空间-', webpage, 'uploader', fatal=False) + + def _extract_playlist(self, fetch_page, get_metadata, get_entries): + metadata, page_list = super()._extract_playlist(fetch_page, get_metadata, get_entries) + metadata.pop('page_count', None) + metadata.pop('page_size', None) + return metadata, page_list + + +class BilibiliCollectionListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail/?\?sid=(?P<sid>\d+)' _TESTS = [{ 'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445', 'info_dict': { 'id': '2142762_57445', - 'title': '《底特律 变人》' + 'title': '【完结】《底特律 变人》全结局流程解说', + 'description': '', + 'uploader': '老戴在此', + 'uploader_id': '2142762', + 'timestamp': int, + 'upload_date': str, + 'thumbnail': 'https://archive.biliimg.com/bfs/archive/e0e543ae35ad3df863ea7dea526bc32e70f4c091.jpg', }, 'playlist_mincount': 31, }] @@ -706,22 +730,251 @@ def get_metadata(page_data): return { 'page_count': math.ceil(entry_count / page_size), 'page_size': page_size, - 'title': traverse_obj(page_data, ('meta', 'name')) + 'uploader': self._get_uploader(mid, playlist_id), + **traverse_obj(page_data, { + 'title': ('meta', 'name', {str}), + 'description': ('meta', 'description', {str}), + 'uploader_id': ('meta', 'mid', {str_or_none}), + 'timestamp': ('meta', 'ptime', {int_or_none}), + 'thumbnail': ('meta', 'cover', {url_or_none}), + }) } def get_entries(page_data): - for entry in page_data.get('archives', []): - yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}', - BiliBiliIE, entry['bvid']) + return self._get_entries(page_data, 'archives') metadata, 
paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) - return self.playlist_result(paged_list, playlist_id, metadata['title']) + return self.playlist_result(paged_list, playlist_id, **metadata) + + +class BilibiliSeriesListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<mid>\d+)/channel/seriesdetail/?\?\bsid=(?P<sid>\d+)' + _TESTS = [{ + 'url': 'https://space.bilibili.com/1958703906/channel/seriesdetail?sid=547718&ctype=0', + 'info_dict': { + 'id': '1958703906_547718', + 'title': '直播回放', + 'description': '直播回放', + 'uploader': '靡烟miya', + 'uploader_id': '1958703906', + 'timestamp': 1637985853, + 'upload_date': '20211127', + 'modified_timestamp': int, + 'modified_date': str, + }, + 'playlist_mincount': 513, + }] + + def _real_extract(self, url): + mid, sid = self._match_valid_url(url).group('mid', 'sid') + playlist_id = f'{mid}_{sid}' + playlist_meta = traverse_obj(self._download_json( + f'https://api.bilibili.com/x/series/series?series_id={sid}', playlist_id, fatal=False + ), { + 'title': ('data', 'meta', 'name', {str}), + 'description': ('data', 'meta', 'description', {str}), + 'uploader_id': ('data', 'meta', 'mid', {str_or_none}), + 'timestamp': ('data', 'meta', 'ctime', {int_or_none}), + 'modified_timestamp': ('data', 'meta', 'mtime', {int_or_none}), + }) + + def fetch_page(page_idx): + return self._download_json( + 'https://api.bilibili.com/x/series/archives', + playlist_id, note=f'Downloading page {page_idx}', + query={'mid': mid, 'series_id': sid, 'pn': page_idx + 1, 'ps': 30})['data'] + + def get_metadata(page_data): + page_size = page_data['page']['size'] + entry_count = page_data['page']['total'] + return { + 'page_count': math.ceil(entry_count / page_size), + 'page_size': page_size, + 'uploader': self._get_uploader(mid, playlist_id), + **playlist_meta + } + + def get_entries(page_data): + return self._get_entries(page_data, 'archives') + + metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) + return self.playlist_result(paged_list, playlist_id, **metadata) + + +class BilibiliFavoritesListIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:space\.bilibili\.com/\d+/favlist/?\?fid=|(?:www\.)?bilibili\.com/medialist/detail/ml)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://space.bilibili.com/84912/favlist?fid=1103407912&ftype=create', + 'info_dict': { + 'id': '1103407912', + 'title': '【V2】(旧)', + 'description': '', + 'uploader': '晓月春日', + 'uploader_id': '84912', + 'timestamp': 1604905176, + 'upload_date': '20201109', + 'modified_timestamp': int, + 'modified_date': str, + 'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg", + 'view_count': int, + 'like_count': int, + }, + 'playlist_mincount': 22, + }, { + 'url': 'https://www.bilibili.com/medialist/detail/ml1103407912', + 'only_matching': True, + }] + + def _real_extract(self, url): + fid = self._match_id(url) + + list_info = self._download_json( + f'https://api.bilibili.com/x/v3/fav/resource/list?media_id={fid}&pn=1&ps=20', + fid, note='Downloading favlist metadata') + if list_info['code'] == -403: + self.raise_login_required(msg='This is a private favorites list. 
You need to log in as its owner') + + entries = self._get_entries(self._download_json( + f'https://api.bilibili.com/x/v3/fav/resource/ids?media_id={fid}', + fid, note='Download favlist entries'), 'data') + + return self.playlist_result(entries, fid, **traverse_obj(list_info, ('data', 'info', { + 'title': ('title', {str}), + 'description': ('intro', {str}), + 'uploader': ('upper', 'name', {str}), + 'uploader_id': ('upper', 'mid', {str_or_none}), + 'timestamp': ('ctime', {int_or_none}), + 'modified_timestamp': ('mtime', {int_or_none}), + 'thumbnail': ('cover', {url_or_none}), + 'view_count': ('cnt_info', 'play', {int_or_none}), + 'like_count': ('cnt_info', 'thumb_up', {int_or_none}), + }))) + + +class BilibiliWatchlaterIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/watchlater/?(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/watchlater/#/list', + 'info_dict': {'id': 'watchlater'}, + 'playlist_mincount': 0, + 'skip': 'login required', + }] + + def _real_extract(self, url): + list_id = getattr(self._get_cookies(url).get('DedeUserID'), 'value', 'watchlater') + watchlater_info = self._download_json( + 'https://api.bilibili.com/x/v2/history/toview/web?jsonp=jsonp', list_id) + if watchlater_info['code'] == -101: + self.raise_login_required(msg='You need to login to access your watchlater list') + entries = self._get_entries(watchlater_info, ('data', 'list')) + return self.playlist_result(entries, id=list_id, title='稍后再看') + + +class BilibiliPlaylistIE(BilibiliSpaceListBaseIE): + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/(?:medialist/play|list)/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/list/1958703906?sid=547718', + 'info_dict': { + 'id': '5_547718', + 'title': '直播回放', + 'uploader': '靡烟miya', + 'uploader_id': '1958703906', + 'timestamp': 1637985853, + 'upload_date': '20211127', + }, + 'playlist_mincount': 513, + }, { + 'url': 'https://www.bilibili.com/medialist/play/1958703906?business=space_series&business_id=547718&desc=1', + 'info_dict': { + 'id': '5_547718', + }, + 'playlist_mincount': 513, + 'skip': 'redirect url', + }, { + 'url': 'https://www.bilibili.com/list/ml1103407912', + 'info_dict': { + 'id': '3_1103407912', + 'title': '【V2】(旧)', + 'uploader': '晓月春日', + 'uploader_id': '84912', + 'timestamp': 1604905176, + 'upload_date': '20201109', + 'thumbnail': r"re:http://i\d\.hdslb\.com/bfs/archive/14b83c62aa8871b79083df1e9ab4fbc699ad16fe\.jpg", + }, + 'playlist_mincount': 22, + }, { + 'url': 'https://www.bilibili.com/medialist/play/ml1103407912', + 'info_dict': { + 'id': '3_1103407912', + }, + 'playlist_mincount': 22, + 'skip': 'redirect url', + }, { + 'url': 'https://www.bilibili.com/list/watchlater', + 'info_dict': {'id': 'watchlater'}, + 'playlist_mincount': 0, + 'skip': 'login required', + }, { + 'url': 'https://www.bilibili.com/medialist/play/watchlater', + 'info_dict': {'id': 'watchlater'}, + 'playlist_mincount': 0, + 'skip': 'login required', + }] + + def _extract_medialist(self, query, list_id): + for page_num in itertools.count(1): + page_data = self._download_json( + 'https://api.bilibili.com/x/v2/medialist/resource/list', + list_id, query=query, note=f'getting playlist {query["biz_id"]} page {page_num}' + )['data'] + yield from self._get_entries(page_data, 'media_list', ending_key='bv_id') + query['oid'] = traverse_obj(page_data, ('media_list', -1, 'id')) + if not page_data.get('has_more', False): + break + + def _real_extract(self, url): + list_id = self._match_id(url) + webpage = self._download_webpage(url, list_id) 
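# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): the next added line relies on
# InfoExtractor._search_json to lift window.__INITIAL_STATE__ out of the
# page. A rough standalone equivalent for a well-formed page, using only the
# standard library:
import json
import re

_sample_page = '<script>window.__INITIAL_STATE__={"error":{"code":200}};</script>'

def _naive_initial_state(webpage):
    # Anchor on the trailing ';' so the non-greedy match closes the object.
    m = re.search(r'window\.__INITIAL_STATE__\s*=\s*(\{.*?\})\s*;', webpage)
    return json.loads(m.group(1)) if m else None

assert _naive_initial_state(_sample_page) == {'error': {'code': 200}}
# _search_json is sturdier than this sketch: it hands the candidate text to a
# lenient JSON decoder rather than trusting the regex to find the closing
# brace, so a '};' inside a string value will not cut the match short.
# ---------------------------------------------------------------------------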
+ initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', list_id) + if traverse_obj(initial_state, ('error', 'code', {int_or_none})) != 200: + error_code = traverse_obj(initial_state, ('error', 'trueCode', {int_or_none})) + error_message = traverse_obj(initial_state, ('error', 'message', {str_or_none})) + if error_code == -400 and list_id == 'watchlater': + self.raise_login_required('You need to login to access your watchlater playlist') + elif error_code == -403: + self.raise_login_required('This is a private playlist. You need to login as its owner') + elif error_code == 11010: + raise ExtractorError('Playlist is no longer available', expected=True) + raise ExtractorError(f'Could not access playlist: {error_code} {error_message}') + + query = { + 'ps': 20, + 'with_current': False, + **traverse_obj(initial_state, { + 'type': ('playlist', 'type', {int_or_none}), + 'biz_id': ('playlist', 'id', {int_or_none}), + 'tid': ('tid', {int_or_none}), + 'sort_field': ('sortFiled', {int_or_none}), + 'desc': ('desc', {bool_or_none}, {str_or_none}, {str.lower}), + }) + } + metadata = { + 'id': f'{query["type"]}_{query["biz_id"]}', + **traverse_obj(initial_state, ('mediaListInfo', { + 'title': ('title', {str}), + 'uploader': ('upper', 'name', {str}), + 'uploader_id': ('upper', 'mid', {str_or_none}), + 'timestamp': ('ctime', {int_or_none}), + 'thumbnail': ('cover', {url_or_none}), + })), + } + return self.playlist_result(self._extract_medialist(query, list_id), **metadata) class BilibiliCategoryIE(InfoExtractor): IE_NAME = 'Bilibili category extractor' _MAX_RESULTS = 1000000 - _VALID_URL = r'https?://www\.bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/v/[a-zA-Z]+\/[a-zA-Z]+' _TESTS = [{ 'url': 'https://www.bilibili.com/v/kichiku/mad', 'info_dict': { @@ -1406,7 +1659,7 @@ def _real_extract(self, url): class BiliLiveIE(InfoExtractor): - _VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)' + _VALID_URL = r'https?://live\.bilibili\.com/(?:blanc/)?(?P<id>\d+)' _TESTS = [{ 'url': 'https://live.bilibili.com/196', From 69b03f84f8378b0b5a2fbae56f9b7d860b2f529e Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Mon, 18 Sep 2023 19:06:36 -0400 Subject: [PATCH 166/218] [ie/weibo] Fix extractor and support user extraction (#7657) Closes #3964, Closes #4673, Closes #6979 Authored by: c-basalt --- yt_dlp/extractor/_extractors.py | 3 +- yt_dlp/extractor/weibo.py | 319 +++++++++++++++++++++----------- 2 files changed, 215 insertions(+), 107 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index a6a286766..47d983c9c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2371,7 +2371,8 @@ ) from .weibo import ( WeiboIE, - WeiboMobileIE + WeiboVideoIE, + WeiboUserIE, ) from .weiqitv import WeiqiTVIE from .weverse import ( diff --git a/yt_dlp/extractor/weibo.py b/yt_dlp/extractor/weibo.py index bc9a71abe..b0c3052b6 100644 --- a/yt_dlp/extractor/weibo.py +++ b/yt_dlp/extractor/weibo.py @@ -1,134 +1,241 @@ -from .common import InfoExtractor - -import json import random -import re +import itertools +import urllib.parse -from ..compat import ( - compat_parse_qs, - compat_str, -) +from .common import InfoExtractor from ..utils import ( - js_to_json, + int_or_none, + make_archive_id, + mimetype2ext, + parse_resolution, + str_or_none, strip_jsonp, + traverse_obj, + url_or_none, urlencode_postdata, + urljoin, ) -class 
WeiboIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?weibo\.com/[0-9]+/(?P<id>[a-zA-Z0-9]+)' - _TEST = { - 'url': 'https://weibo.com/6275294458/Fp6RGfbff?type=comment', - 'info_dict': { - 'id': 'Fp6RGfbff', - 'ext': 'mp4', - 'title': 'You should have servants to massage you,... 来自Hosico_猫 - 微博', - } - } +class WeiboBaseIE(InfoExtractor): + def _update_visitor_cookies(self, video_id): + visitor_data = self._download_json( + 'https://passport.weibo.com/visitor/genvisitor', video_id, + note='Generating first-visit guest request', + transform_source=strip_jsonp, + data=urlencode_postdata({ + 'cb': 'gen_callback', + 'fp': '{"os":"2","browser":"Gecko57,0,0,0","fonts":"undefined","screenInfo":"1440*900*24","plugins":""}', + })) - def _real_extract(self, url): - video_id = self._match_id(url) - # to get Referer url for genvisitor - webpage, urlh = self._download_webpage_handle(url, video_id) - - visitor_url = urlh.url - - if 'passport.weibo.com' in visitor_url: - # first visit - visitor_data = self._download_json( - 'https://passport.weibo.com/visitor/genvisitor', video_id, - note='Generating first-visit data', - transform_source=strip_jsonp, - headers={'Referer': visitor_url}, - data=urlencode_postdata({ - 'cb': 'gen_callback', - 'fp': json.dumps({ - 'os': '2', - 'browser': 'Gecko57,0,0,0', - 'fonts': 'undefined', - 'screenInfo': '1440*900*24', - 'plugins': '', - }), - })) - - tid = visitor_data['data']['tid'] - cnfd = '%03d' % visitor_data['data']['confidence'] - - self._download_webpage( - 'https://passport.weibo.com/visitor/visitor', video_id, - note='Running first-visit callback', - query={ - 'a': 'incarnate', - 't': tid, - 'w': 2, - 'c': cnfd, - 'cb': 'cross_domain', - 'from': 'weibo', - '_rand': random.random(), - }) - - webpage = self._download_webpage( - url, video_id, note='Revisiting webpage') - - title = self._html_extract_title(webpage) - - video_formats = compat_parse_qs(self._search_regex( - r'video-sources=\\\"(.+?)\"', webpage, 'video_sources')) - - formats = [] - supported_resolutions = (480, 720) - for res in supported_resolutions: - vid_urls = video_formats.get(compat_str(res)) - if not vid_urls or not isinstance(vid_urls, list): - continue - - vid_url = vid_urls[0] - formats.append({ - 'url': vid_url, - 'height': res, + self._download_webpage( + 'https://passport.weibo.com/visitor/visitor', video_id, + note='Running first-visit callback to get guest cookies', + query={ + 'a': 'incarnate', + 't': visitor_data['data']['tid'], + 'w': 2, + 'c': '%03d' % visitor_data['data']['confidence'], + 'cb': 'cross_domain', + 'from': 'weibo', + '_rand': random.random(), }) - uploader = self._og_search_property( - 'nick-name', webpage, 'uploader', default=None) + def _weibo_download_json(self, url, video_id, *args, fatal=True, note='Downloading JSON metadata', **kwargs): + webpage, urlh = self._download_webpage_handle(url, video_id, *args, fatal=fatal, note=note, **kwargs) + if urllib.parse.urlparse(urlh.url).netloc == 'passport.weibo.com': + self._update_visitor_cookies(video_id) + webpage = self._download_webpage(url, video_id, *args, fatal=fatal, note=note, **kwargs) + return self._parse_json(webpage, video_id, fatal=fatal) + def _extract_formats(self, video_info): + media_info = traverse_obj(video_info, ('page_info', 'media_info')) + formats = traverse_obj(media_info, ( + 'playback_list', lambda _, v: url_or_none(v['play_info']['url']), 'play_info', { + 'url': 'url', + 'format': ('quality_desc', {str}), + 'format_id': ('label', {str}), + 'ext': ('mime', {mimetype2ext}), + 'tbr': 
('bitrate', {int_or_none}, {lambda x: x or None}), + 'vcodec': ('video_codecs', {str}), + 'fps': ('fps', {int_or_none}), + 'width': ('width', {int_or_none}), + 'height': ('height', {int_or_none}), + 'filesize': ('size', {int_or_none}), + 'acodec': ('audio_codecs', {str}), + 'asr': ('audio_sample_rate', {int_or_none}), + 'audio_channels': ('audio_channels', {int_or_none}), + })) + if not formats: # fallback, should be barely used + for url in set(traverse_obj(media_info, (..., {url_or_none}))): + if 'label=' in url: # filter out non-video urls + format_id, resolution = self._search_regex( + r'label=(\w+)&template=(\d+x\d+)', url, 'format info', + group=(1, 2), default=(None, None)) + formats.append({ + 'url': url, + 'format_id': format_id, + **parse_resolution(resolution), + **traverse_obj(media_info, ( + 'video_details', lambda _, v: v['label'].startswith(format_id), { + 'size': ('size', {int_or_none}), + 'tbr': ('bitrate', {int_or_none}), + } + ), get_all=False), + }) + return formats + + def _parse_video_info(self, video_info, video_id=None): return { 'id': video_id, - 'title': title, - 'uploader': uploader, - 'formats': formats + 'extractor_key': WeiboIE.ie_key(), + 'extractor': WeiboIE.IE_NAME, + 'formats': self._extract_formats(video_info), + 'http_headers': {'Referer': 'https://weibo.com/'}, + '_old_archive_ids': [make_archive_id('WeiboMobile', video_id)], + **traverse_obj(video_info, { + 'id': (('id', 'id_str', 'mid'), {str_or_none}), + 'display_id': ('mblogid', {str_or_none}), + 'title': ('page_info', 'media_info', ('video_title', 'kol_title', 'name'), {str}, {lambda x: x or None}), + 'description': ('text_raw', {str}), + 'duration': ('page_info', 'media_info', 'duration', {int_or_none}), + 'timestamp': ('page_info', 'media_info', 'video_publish_time', {int_or_none}), + 'thumbnail': ('page_info', 'page_pic', {url_or_none}), + 'uploader': ('user', 'screen_name', {str}), + 'uploader_id': ('user', ('id', 'id_str'), {str_or_none}), + 'uploader_url': ('user', 'profile_url', {lambda x: urljoin('https://weibo.com/', x)}), + 'view_count': ('page_info', 'media_info', 'online_users_number', {int_or_none}), + 'like_count': ('attitudes_count', {int_or_none}), + 'repost_count': ('reposts_count', {int_or_none}), + }, get_all=False), + 'tags': traverse_obj(video_info, ('topic_struct', ..., 'topic_title', {str})) or None, } -class WeiboMobileIE(InfoExtractor): - _VALID_URL = r'https?://m\.weibo\.cn/status/(?P<id>[0-9]+)(\?.+)?' 
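# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): WeiboMobileIE is removed here,
# but _parse_video_info above records '_old_archive_ids' via make_archive_id
# so that --download-archive entries written by the old extractor still
# match. The helper just lowercases the extractor key and joins it to the id:
from yt_dlp.utils import make_archive_id

assert make_archive_id('WeiboMobile', '4189191225395228') == \
    'weibomobile 4189191225395228'
# ...which is exactly the 'extractor video_id' line format of the archive file.
# ---------------------------------------------------------------------------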
- _TEST = { - 'url': 'https://m.weibo.cn/status/4189191225395228?wm=3333_2001&sourcetype=weixin&featurecode=newtitle&from=singlemessage&isappinstalled=0', +class WeiboIE(WeiboBaseIE): + _VALID_URL = r'https?://(?:m\.weibo\.cn/status|(?:www\.)?weibo\.com/\d+)/(?P<id>[a-zA-Z0-9]+)' + _TESTS = [{ + 'url': 'https://weibo.com/7827771738/N4xlMvjhI', + 'info_dict': { + 'id': '4910815147462302', + 'ext': 'mp4', + 'display_id': 'N4xlMvjhI', + 'title': '【睡前消息暑假版第一期:拉泰国一把 对中国有好处】', + 'description': 'md5:e2637a7673980d68694ea7c43cf12a5f', + 'duration': 918, + 'timestamp': 1686312819, + 'upload_date': '20230609', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': '睡前视频基地', + 'uploader_id': '7827771738', + 'uploader_url': 'https://weibo.com/u/7827771738', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'tags': ['泰国大选远进党获胜', '睡前消息', '暑期版'], + }, + }, { + 'url': 'https://m.weibo.cn/status/4189191225395228', 'info_dict': { 'id': '4189191225395228', 'ext': 'mp4', - 'title': '午睡当然是要甜甜蜜蜜的啦', - 'uploader': '柴犬柴犬' + 'display_id': 'FBqgOmDxO', + 'title': '柴犬柴犬的秒拍视频', + 'description': 'md5:80f461ab5cdae6bbdb70efbf5a1db24f', + 'duration': 53, + 'timestamp': 1514264429, + 'upload_date': '20171226', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': '柴犬柴犬', + 'uploader_id': '5926682210', + 'uploader_url': 'https://weibo.com/u/5926682210', + 'view_count': int, + 'like_count': int, + 'repost_count': int, } - } + }, { + 'url': 'https://weibo.com/0/4224132150961381', + 'note': 'no playback_list example', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - # to get Referer url for genvisitor - webpage = self._download_webpage(url, video_id, note='visit the page') - weibo_info = self._parse_json(self._search_regex( - r'var\s+\$render_data\s*=\s*\[({.*})\]\[0\]\s*\|\|\s*{};', - webpage, 'js_code', flags=re.DOTALL), - video_id, transform_source=js_to_json) + return self._parse_video_info(self._weibo_download_json( + f'https://weibo.com/ajax/statuses/show?id={video_id}', video_id)) - status_data = weibo_info.get('status', {}) - page_info = status_data.get('page_info') - title = status_data['status_title'] - uploader = status_data.get('user', {}).get('screen_name') - return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'url': page_info['media_info']['stream_url'] +class WeiboVideoIE(WeiboBaseIE): + _VALID_URL = r'https?://(?:www\.)?weibo\.com/tv/show/(?P<id>\d+:\d+)' + _TESTS = [{ + 'url': 'https://weibo.com/tv/show/1034:4797699866951785?from=old_pc_videoshow', + 'info_dict': { + 'id': '4797700463137878', + 'ext': 'mp4', + 'display_id': 'LEZDodaiW', + 'title': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了', + 'description': '呃,稍微了解了一下靡烟miya,感觉这东西也太二了 http://t.cn/A6aerGsM ​​​', + 'duration': 76, + 'timestamp': 1659344278, + 'upload_date': '20220801', + 'thumbnail': r're:https://.*\.jpg', + 'uploader': '君子爱财陈平安', + 'uploader_id': '3905382233', + 'uploader_url': 'https://weibo.com/u/3905382233', + 'view_count': int, + 'like_count': int, + 'repost_count': int, } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + post_data = f'data={{"Component_Play_Playinfo":{{"oid":"{video_id}"}}}}'.encode() + video_info = self._weibo_download_json( + f'https://weibo.com/tv/api/component?page=%2Ftv%2Fshow%2F{video_id.replace(":", "%3A")}', + video_id, headers={'Referer': url}, data=post_data)['data']['Component_Play_Playinfo'] + return self.url_result(f'https://weibo.com/0/{video_info["mid"]}', WeiboIE) + + +class WeiboUserIE(WeiboBaseIE): + _VALID_URL = 
r'https?://(?:www\.)?weibo\.com/u/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://weibo.com/u/2066652961?tabtype=video', + 'info_dict': { + 'id': '2066652961', + 'title': '萧影殿下的视频', + 'description': '萧影殿下的全部视频', + 'uploader': '萧影殿下', + }, + 'playlist_mincount': 195, + }] + + def _fetch_page(self, uid, cursor=0, page=1): + return self._weibo_download_json( + 'https://weibo.com/ajax/profile/getWaterFallContent', + uid, note=f'Downloading videos page {page}', + query={'uid': uid, 'cursor': cursor})['data'] + + def _entries(self, uid, first_page): + cursor = 0 + for page in itertools.count(1): + response = first_page if page == 1 else self._fetch_page(uid, cursor, page) + for video_info in traverse_obj(response, ('list', ..., {dict})): + yield self._parse_video_info(video_info) + cursor = response.get('next_cursor') + if (int_or_none(cursor) or -1) < 0: + break + + def _real_extract(self, url): + uid = self._match_id(url) + first_page = self._fetch_page(uid) + uploader = traverse_obj(first_page, ('list', ..., 'user', 'screen_name', {str}), get_all=False) + metainfo = { + 'title': f'{uploader}的视频', + 'description': f'{uploader}的全部视频', + 'uploader': uploader, + } if uploader else {} + + return self.playlist_result(self._entries(uid, first_page), uid, **metainfo) From 8ac5b6d96ae5c60cd5ae2495949e0068a6754c45 Mon Sep 17 00:00:00 2001 From: u-spec-png <srdjankalaba@protonmail.ch> Date: Tue, 19 Sep 2023 01:36:10 +0200 Subject: [PATCH 167/218] [ie/N1Info:article] Fix extractor (#7373) Authored by: u-spec-png --- yt_dlp/extractor/n1.py | 52 +++++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/yt_dlp/extractor/n1.py b/yt_dlp/extractor/n1.py index 55345f398..edc41443a 100644 --- a/yt_dlp/extractor/n1.py +++ b/yt_dlp/extractor/n1.py @@ -33,7 +33,7 @@ def _real_extract(self, url): class N1InfoIIE(InfoExtractor): IE_NAME = 'N1Info:article' - _VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:(?:\w+\.)?n1info\.\w+|nova\.rs)/(?:[^/?#]+/){1,2}(?P<id>[^/?#]+)' _TESTS = [{ # Youtube embedded 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', @@ -94,6 +94,16 @@ class N1InfoIIE(InfoExtractor): 'upload_date': '20211102', 'timestamp': 1635861677, }, + }, { + 'url': 'https://n1info.rs/vesti/cuta-biti-u-kosovskoj-mitrovici-znaci-da-te-docekaju-eksplozivnim-napravama/', + 'info_dict': { + 'id': '1332368', + 'ext': 'mp4', + 'title': 'Ćuta: Biti u Kosovskoj Mitrovici znači da te dočekaju eksplozivnim napravama', + 'upload_date': '20230620', + 'timestamp': 1687290536, + 'thumbnail': 'https://cdn.brid.tv/live/partners/26827/snapshot/1332368_th_6492013a8356f_1687290170.jpg' + }, }, { 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', 'only_matching': True, @@ -105,19 +115,35 @@ def _real_extract(self, url): title = self._html_search_regex(r'<h1[^>]+>(.+?)</h1>', webpage, 'title') timestamp = unified_timestamp(self._html_search_meta('article:published_time', webpage)) - - videos = re.findall(r'(?m)(<video[^>]+>)', webpage) + plugin_data = self._html_search_meta('BridPlugin', webpage) entries = [] - for video in videos: - video_data = extract_attributes(video) - entries.append({ - '_type': 'url_transparent', - 'url': video_data.get('data-url'), - 'id': video_data.get('id'), - 'title': title, - 'thumbnail': video_data.get('data-thumbnail'), - 'timestamp': timestamp, - 'ie_key': 
'N1InfoAsset'}) + if plugin_data: + site_id = self._html_search_regex(r'site:(\d+)', webpage, 'site id') + for video_data in re.findall(r'\$bp\("Brid_\d+", (.+)\);', webpage): + video_id = self._parse_json(video_data, title)['video'] + entries.append({ + 'id': video_id, + 'title': title, + 'timestamp': timestamp, + 'thumbnail': self._html_search_meta('thumbnailURL', webpage), + 'formats': self._extract_m3u8_formats( + f'https://cdn-uc.brid.tv/live/partners/{site_id}/streaming/{video_id}/{video_id}.m3u8', + video_id, fatal=False), + }) + else: + # Old player still present in older articles + videos = re.findall(r'(?m)(<video[^>]+>)', webpage) + for video in videos: + video_data = extract_attributes(video) + entries.append({ + '_type': 'url_transparent', + 'url': video_data.get('data-url'), + 'id': video_data.get('id'), + 'title': title, + 'thumbnail': video_data.get('data-thumbnail'), + 'timestamp': timestamp, + 'ie_key': 'N1InfoAsset', + }) embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) for embedded_video in embedded_videos: From 40999467f72db074a3f13057da9bf82a857530fe Mon Sep 17 00:00:00 2001 From: niemands <67282402+niemands@users.noreply.github.com> Date: Tue, 19 Sep 2023 01:37:17 +0200 Subject: [PATCH 168/218] [ie/pornbox] Add extractor (#7386) Authored by: niemands --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/pornbox.py | 113 ++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+) create mode 100644 yt_dlp/extractor/pornbox.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 47d983c9c..dd670d59c 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1505,6 +1505,7 @@ from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE from .porn91 import Porn91IE +from .pornbox import PornboxIE from .porncom import PornComIE from .pornflip import PornFlipIE from .pornhd import PornHdIE diff --git a/yt_dlp/extractor/pornbox.py b/yt_dlp/extractor/pornbox.py new file mode 100644 index 000000000..c381382e9 --- /dev/null +++ b/yt_dlp/extractor/pornbox.py @@ -0,0 +1,113 @@ +from .common import InfoExtractor +from ..compat import functools +from ..utils import ( + int_or_none, + parse_duration, + parse_iso8601, + qualities, + str_or_none, + traverse_obj, + url_or_none, +) + + +class PornboxIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pornbox\.com/application/watch-page/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://pornbox.com/application/watch-page/212108', + 'md5': '3ff6b6e206f263be4c5e987a3162ac6e', + 'info_dict': { + 'id': '212108', + 'ext': 'mp4', + 'title': 'md5:ececc5c6e6c9dd35d290c45fed05fd49', + 'uploader': 'Lily Strong', + 'timestamp': 1665871200, + 'upload_date': '20221015', + 'age_limit': 18, + 'availability': 'needs_auth', + 'duration': 1505, + 'cast': ['Lily Strong', 'John Strong'], + 'tags': 'count:11', + 'description': 'md5:589c7f33e183aa8aa939537300efb859', + 'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$' + } + }, { + 'url': 'https://pornbox.com/application/watch-page/216045', + 'info_dict': { + 'id': '216045', + 'title': 'md5:3e48528e73a9a2b12f7a2772ed0b26a2', + 'description': 'md5:3e631dcaac029f15ed434e402d1b06c7', + 'uploader': 'VK Studio', + 'timestamp': 1618264800, + 'upload_date': '20210412', + 'age_limit': 18, + 'availability': 'premium_only', + 'duration': 2710, + 'cast': 'count:3', + 'tags': 'count:29', + 'thumbnail': r're:^https?://cdn-image\.gtflixtv\.com.*\.jpg.*$', + 'subtitles': 'count:6' + }, + 'params': { + 
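# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patch): further down, this extractor's
# _real_extract ranks the stream variants with yt_dlp.utils.qualities, which
# turns a worst-to-best list of labels into a preference function (unknown
# labels sort below everything). A quick self-contained check:
from yt_dlp.utils import qualities

_get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k'])
assert _get_quality('8k') > _get_quality('1080p') > _get_quality('web')
assert _get_quality('bogus') == -1  # unlisted labels rank last
# ---------------------------------------------------------------------------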
'skip_download': True, + 'ignore_no_formats_error': True + }, + 'expected_warnings': [ + 'You are either not logged in or do not have access to this scene', + 'No video formats found', 'Requested format is not available'] + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + public_data = self._download_json(f'https://pornbox.com/contents/{video_id}', video_id) + + subtitles = {country_code: [{ + 'url': f'https://pornbox.com/contents/{video_id}/subtitles/{country_code}', + 'ext': 'srt' + }] for country_code in traverse_obj(public_data, ('subtitles', ..., {str}))} + + is_free_scene = traverse_obj( + public_data, ('price', 'is_available_for_free', {bool}), default=False) + + metadata = { + 'id': video_id, + **traverse_obj(public_data, { + 'title': ('scene_name', {str.strip}), + 'description': ('small_description', {str.strip}), + 'uploader': 'studio', + 'duration': ('runtime', {parse_duration}), + 'cast': (('models', 'male_models'), ..., 'model_name'), + 'thumbnail': ('player_poster', {url_or_none}), + 'tags': ('niches', ..., 'niche'), + }), + 'age_limit': 18, + 'timestamp': parse_iso8601(traverse_obj( + public_data, ('studios', 'release_date'), 'publish_date')), + 'availability': self._availability(needs_auth=True, needs_premium=not is_free_scene), + 'subtitles': subtitles, + } + + if not public_data.get('is_purchased') or not is_free_scene: + self.raise_login_required( + 'You are either not logged in or do not have access to this scene', metadata_available=True) + return metadata + + media_id = traverse_obj(public_data, ( + 'medias', lambda _, v: v['title'] == 'Full video', 'media_id', {int}), get_all=False) + if not media_id: + self.raise_no_formats('Could not find stream id', video_id=video_id) + + stream_data = self._download_json( + f'https://pornbox.com/media/{media_id}/stream', video_id=video_id, note='Getting manifest urls') + + get_quality = qualities(['web', 'vga', 'hd', '1080p', '4k', '8k']) + metadata['formats'] = traverse_obj(stream_data, ('qualities', lambda _, v: v['src'], { + 'url': 'src', + 'vbr': ('bitrate', {functools.partial(int_or_none, scale=1000)}), + 'format_id': ('quality', {str_or_none}), + 'quality': ('quality', {get_quality}), + 'width': ('size', {lambda x: int(x[:-1])}), + })) + + return metadata From cf11b40ac40e3d23a6352753296f3a732886efb9 Mon Sep 17 00:00:00 2001 From: Rohan Dey <142105763+Rohxn16@users.noreply.github.com> Date: Mon, 18 Sep 2023 23:39:20 +0000 Subject: [PATCH 169/218] [ie/media.ccc.de:lists] Fix extraction (#8144) Closes #8138 Authored by: Rohxn16 --- yt_dlp/extractor/ccc.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/ccc.py b/yt_dlp/extractor/ccc.py index 22e3a22ec..ca6b82c98 100644 --- a/yt_dlp/extractor/ccc.py +++ b/yt_dlp/extractor/ccc.py @@ -90,10 +90,17 @@ class CCCPlaylistIE(InfoExtractor): 'id': '30c3', }, 'playlist_count': 135, + }, { + 'url': 'https://media.ccc.de/c/DS2023', + 'info_dict': { + 'title': 'Datenspuren 2023', + 'id': 'DS2023', + }, + 'playlist_count': 37 }] def _real_extract(self, url): - playlist_id = self._match_id(url).lower() + playlist_id = self._match_id(url) conf = self._download_json( 'https://media.ccc.de/public/conferences/' + playlist_id, From b532556d0a85e7d76f8f0880861232fb706ddbc5 Mon Sep 17 00:00:00 2001 From: Simon Sawicki <contact@grub4k.xyz> Date: Tue, 19 Sep 2023 21:52:44 +0200 Subject: [PATCH 170/218] [ie/pr0gramm] Rewrite extractor (#8151) Authored by: Grub4K --- yt_dlp/extractor/_extractors.py | 2 +- yt_dlp/extractor/pr0gramm.py | 
218 ++++++++++++++++++++------------ 2 files changed, 139 insertions(+), 81 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index dd670d59c..490b010b8 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1524,7 +1524,7 @@ PuhuTVIE, PuhuTVSerieIE, ) -from .pr0gramm import Pr0grammStaticIE, Pr0grammIE +from .pr0gramm import Pr0grammIE from .prankcast import PrankCastIE from .premiershiprugby import PremiershipRugbyIE from .presstv import PressTVIE diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py index 2eb327fba..c8e0bb493 100644 --- a/yt_dlp/extractor/pr0gramm.py +++ b/yt_dlp/extractor/pr0gramm.py @@ -1,97 +1,155 @@ -import re +import json +from datetime import date +from urllib.parse import unquote from .common import InfoExtractor -from ..utils import merge_dicts +from ..compat import functools +from ..utils import ExtractorError, make_archive_id, urljoin +from ..utils.traversal import traverse_obj -class Pr0grammStaticIE(InfoExtractor): - # Possible urls: - # https://pr0gramm.com/static/5466437 - _VALID_URL = r'https?://pr0gramm\.com/static/(?P<id>[0-9]+)' - _TEST = { - 'url': 'https://pr0gramm.com/static/5466437', - 'md5': '52fa540d70d3edc286846f8ca85938aa', - 'info_dict': { - 'id': '5466437', - 'ext': 'mp4', - 'title': 'pr0gramm-5466437 by g11st', - 'uploader': 'g11st', - 'upload_date': '20221221', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - # Fetch media sources - entries = self._parse_html5_media_entries(url, webpage, video_id) - media_info = entries[0] - - # Fetch author - uploader = self._html_search_regex(r'by\W+([\w-]+)\W+', webpage, 'uploader') - - # Fetch approx upload timestamp from filename - # Have None-defaults in case the extraction fails - uploadDay = None - uploadMon = None - uploadYear = None - uploadTimestr = None - # (//img.pr0gramm.com/2022/12/21/62ae8aa5e2da0ebf.mp4) - m = re.search(r'//img\.pr0gramm\.com/(?P<year>[\d]+)/(?P<mon>[\d]+)/(?P<day>[\d]+)/\w+\.\w{,4}', webpage) - - if (m): - # Up to a day of accuracy should suffice... - uploadDay = m.groupdict().get('day') - uploadMon = m.groupdict().get('mon') - uploadYear = m.groupdict().get('year') - uploadTimestr = uploadYear + uploadMon + uploadDay - - return merge_dicts({ - 'id': video_id, - 'title': 'pr0gramm-%s%s' % (video_id, (' by ' + uploader) if uploader else ''), - 'uploader': uploader, - 'upload_date': uploadTimestr - }, media_info) - - -# This extractor is for the primary url (used for sharing, and appears in the -# location bar) Since this page loads the DOM via JS, yt-dl can't find any -# video information here. So let's redirect to a compatibility version of -# the site, which does contain the <video>-element by itself, without requiring -# js to be ran. class Pr0grammIE(InfoExtractor): - # Possible urls: - # https://pr0gramm.com/new/546637 - # https://pr0gramm.com/new/video/546637 - # https://pr0gramm.com/top/546637 - # https://pr0gramm.com/top/video/546637 - # https://pr0gramm.com/user/g11st/uploads/5466437 - # https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290 - # https://pr0gramm.com/user/froschler/reinziehen-1elf/5232030 - # https://pr0gramm.com/user/froschler/1elf/5232030 - # https://pr0gramm.com/new/5495710:comment62621020 <- this is not the id! 
- # https://pr0gramm.com/top/fruher war alles damals/5498175 - - _VALID_URL = r'https?:\/\/pr0gramm\.com\/(?!static/\d+).+?\/(?P<id>[\d]+)(:|$)' - _TEST = { + _VALID_URL = r'https?://pr0gramm\.com\/(?:[^/?#]+/)+(?P<id>[\d]+)(?:[/?#:]|$)' + _TESTS = [{ + # Tags require account 'url': 'https://pr0gramm.com/new/video/5466437', 'info_dict': { 'id': '5466437', 'ext': 'mp4', 'title': 'pr0gramm-5466437 by g11st', + 'tags': ['Neon Genesis Evangelion', 'Touhou Project', 'Fly me to the Moon', 'Marisad', 'Marisa Kirisame', 'video', 'sound', 'Marisa', 'Anime'], 'uploader': 'g11st', + 'uploader_id': 394718, + 'upload_timestamp': 1671590240, 'upload_date': '20221221', - } - } + 'like_count': int, + 'dislike_count': int, + 'age_limit': 0, + 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg', + }, + }, { + # Tags require account + 'url': 'https://pr0gramm.com/new/3052805:comment28391322', + 'info_dict': { + 'id': '3052805', + 'ext': 'mp4', + 'title': 'pr0gramm-3052805 by Hansking1', + 'tags': 'count:15', + 'uploader': 'Hansking1', + 'uploader_id': 385563, + 'upload_timestamp': 1552930408, + 'upload_date': '20190318', + 'like_count': int, + 'dislike_count': int, + 'age_limit': 0, + 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg', + }, + }, { + # Requires verified account + 'url': 'https://pr0gramm.com/new/Gianna%20Michaels/5848332', + 'info_dict': { + 'id': '5848332', + 'ext': 'mp4', + 'title': 'pr0gramm-5848332 by erd0pfel', + 'tags': 'count:18', + 'uploader': 'erd0pfel', + 'uploader_id': 349094, + 'upload_timestamp': 1694489652, + 'upload_date': '20230912', + 'like_count': int, + 'dislike_count': int, + 'age_limit': 18, + 'thumbnail': r're:^https://thumb\.pr0gramm\.com/.*\.jpg', + }, + }, { + 'url': 'https://pr0gramm.com/static/5466437', + 'only_matching': True, + }, { + 'url': 'https://pr0gramm.com/new/rowan%20atkinson%20herr%20bohne/3052805', + 'only_matching': True, + }, { + 'url': 'https://pr0gramm.com/user/froschler/dafur-ist-man-hier/5091290', + 'only_matching': True, + }] - def _generic_title(): - return "oof" + BASE_URL = 'https://pr0gramm.com' + + @functools.cached_property + def _is_logged_in(self): + return 'pp' in self._get_cookies(self.BASE_URL) + + @functools.cached_property + def _maximum_flags(self): + # We need to guess the flags for the content otherwise the api will raise an error + # We can guess the maximum allowed flags for the account from the cookies + # Bitflags are (msbf): nsfp, nsfl, nsfw, sfw + flags = 0b0001 + if self._is_logged_in: + flags |= 0b1000 + cookies = self._get_cookies(self.BASE_URL) + if 'me' not in cookies: + self._download_webpage(self.BASE_URL, None, 'Refreshing verification information') + if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')): + flags |= 0b0110 + + return flags + + def _call_api(self, endpoint, video_id, query={}, note='Downloading API json'): + data = self._download_json( + f'https://pr0gramm.com/api/items/{endpoint}', + video_id, note, query=query, expected_status=403) + + error = traverse_obj(data, ('error', {str})) + if error in ('nsfwRequired', 'nsflRequired', 'nsfpRequired', 'verificationRequired'): + if not self._is_logged_in: + self.raise_login_required() + raise ExtractorError(f'Unverified account cannot access NSFW/NSFL ({error})', expected=True) + elif error: + message = traverse_obj(data, ('msg', {str})) or error + raise ExtractorError(f'API returned error: {message}', expected=True) + + return data def _real_extract(self, url): video_id = self._match_id(url) + video_info = 
traverse_obj( + self._call_api('get', video_id, {'id': video_id, 'flags': self._maximum_flags}), + ('items', 0, {dict})) - return self.url_result( - 'https://pr0gramm.com/static/' + video_id, - video_id=video_id, - ie=Pr0grammStaticIE.ie_key()) + source = urljoin('https://img.pr0gramm.com', video_info.get('image')) + if not source or not source.endswith('mp4'): + self.raise_no_formats('Could not extract a video', expected=bool(source), video_id=video_id) + + tags = None + if self._is_logged_in: + metadata = self._call_api('info', video_id, {'itemId': video_id}) + tags = traverse_obj(metadata, ('tags', ..., 'tag', {str})) + # Sorted by "confidence", higher confidence = earlier in list + confidences = traverse_obj(metadata, ('tags', ..., 'confidence', ({int}, {float}))) + if confidences: + tags = [tag for _, tag in sorted(zip(confidences, tags), reverse=True)] + + return { + 'id': video_id, + 'title': f'pr0gramm-{video_id} by {video_info.get("user")}', + 'formats': [{ + 'url': source, + 'ext': 'mp4', + **traverse_obj(video_info, { + 'width': ('width', {int}), + 'height': ('height', {int}), + }), + }], + 'tags': tags, + 'age_limit': 18 if traverse_obj(video_info, ('flags', {0b110.__and__})) else 0, + '_old_archive_ids': [make_archive_id('Pr0grammStatic', video_id)], + **traverse_obj(video_info, { + 'uploader': ('user', {str}), + 'uploader_id': ('userId', {int}), + 'like_count': ('up', {int}), + 'dislike_count': ('down', {int}), + 'upload_timestamp': ('created', {int}), + 'upload_date': ('created', {int}, {date.fromtimestamp}, {lambda x: x.strftime('%Y%m%d')}), + 'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)}) + }), + } From 9d6254069c75877bc88bc3584f4326fb1853a543 Mon Sep 17 00:00:00 2001 From: coletdjnz <coletdjnz@protonmail.com> Date: Wed, 20 Sep 2023 19:14:10 +0000 Subject: [PATCH 171/218] Update to ytdl-commit-66ab08 (#8128) [utils] Revert bbd3e7e, updating docstring, test instead https://github.com/ytdl-org/youtube-dl/commit/66ab0814c4baa2dc79c2dd5287bc0ad61a37c5b9 Authored by: coletdjnz --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c7b73f4fd..d94d8ea82 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ # NEW FEATURES -* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@42f2d4**](https://github.com/ytdl-org/youtube-dl/commit/07af47960f3bb262ead02490ce65c8c45c01741e) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) +* Forked from [**yt-dlc@f9401f2**](https://github.com/blackjack4494/yt-dlc/commit/f9401f2a91987068139c5f757b12fc711d4c0cee) and merged with [**youtube-dl@66ab08**](https://github.com/ytdl-org/youtube-dl/commit/66ab0814c4baa2dc79c2dd5287bc0ad61a37c5b9) ([exceptions](https://github.com/yt-dlp/yt-dlp/issues/21)) * **[SponsorBlock Integration](#sponsorblock-options)**: You can mark/remove sponsor sections in YouTube videos by utilizing the [SponsorBlock](https://sponsor.ajay.app) API From 35f9a306e6934793cff100200cd03f288ec33f11 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 21 Sep 2023 10:58:53 -0500 Subject: [PATCH 172/218] [dependencies] Handle deprecation of `sqlite3.version` (#8167) Closes #8152 Authored by: bashonly --- yt_dlp/compat/compat_utils.py | 2 +- yt_dlp/dependencies/__init__.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/yt_dlp/compat/compat_utils.py b/yt_dlp/compat/compat_utils.py index 
3ca46d270..d62b7d048 100644 --- a/yt_dlp/compat/compat_utils.py +++ b/yt_dlp/compat/compat_utils.py @@ -15,7 +15,7 @@ def get_package_info(module): name=getattr(module, '_yt_dlp__identifier', module.__name__), version=str(next(filter(None, ( getattr(module, attr, None) - for attr in ('__version__', 'version_string', 'version') + for attr in ('_yt_dlp__version', '__version__', 'version_string', 'version') )), None))) diff --git a/yt_dlp/dependencies/__init__.py b/yt_dlp/dependencies/__init__.py index 6e7d29c5c..b56e4f5cc 100644 --- a/yt_dlp/dependencies/__init__.py +++ b/yt_dlp/dependencies/__init__.py @@ -43,6 +43,8 @@ try: import sqlite3 + # We need to get the underlying `sqlite` version, see https://github.com/yt-dlp/yt-dlp/issues/8152 + sqlite3._yt_dlp__version = sqlite3.sqlite_version except ImportError: # although sqlite3 is part of the standard library, it is possible to compile python without # sqlite support. See: https://github.com/yt-dlp/yt-dlp/issues/544 From 295fbb3ae3a7d0dd50e286be5c487cf145ed5778 Mon Sep 17 00:00:00 2001 From: Mozi <29089388+pzhlkj6612@users.noreply.github.com> Date: Fri, 22 Sep 2023 01:28:20 +0800 Subject: [PATCH 173/218] [ie/eplus:inbound] Add extractor (#5782) Authored by: pzhlkj6612 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/eplus.py | 96 +++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) create mode 100644 yt_dlp/extractor/eplus.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 490b010b8..3ce6baef2 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -565,6 +565,7 @@ EpiconIE, EpiconSeriesIE, ) +from .eplus import EplusIbIE from .epoch import EpochIE from .eporner import EpornerIE from .eroprofile import ( diff --git a/yt_dlp/extractor/eplus.py b/yt_dlp/extractor/eplus.py new file mode 100644 index 000000000..3ebdcf5fb --- /dev/null +++ b/yt_dlp/extractor/eplus.py @@ -0,0 +1,96 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + try_call, + unified_timestamp, +) + + +class EplusIbIE(InfoExtractor): + IE_NAME = 'eplus:inbound' + IE_DESC = 'e+ (イープラス) overseas' + _VALID_URL = r'https?://live\.eplus\.jp/ex/player\?ib=(?P<id>(?:\w|%2B|%2F){86}%3D%3D)' + _TESTS = [{ + 'url': 'https://live.eplus.jp/ex/player?ib=YEFxb3Vyc2Dombnjg7blkrLlrablnJLjgrnjgq%2Fjg7zjg6vjgqLjgqTjg4njg6vlkIzlpb3kvJpgTGllbGxhIQ%3D%3D', + 'info_dict': { + 'id': '354502-0001-002', + 'title': 'LoveLive!Series Presents COUNTDOWN LoveLive! 2021→2022~LIVE with a smile!~【Streaming+(配信)】', + 'live_status': 'was_live', + 'release_date': '20211231', + 'release_timestamp': 1640952000, + 'description': str, + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': [ + 'Could not find the playlist URL. 
This event may not be accessible', + 'No video formats found!', + 'Requested format is not available', + ], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + data_json = self._search_json(r'<script>\s*var app\s*=', webpage, 'data json', video_id) + + delivery_status = data_json.get('delivery_status') + archive_mode = data_json.get('archive_mode') + release_timestamp = try_call(lambda: unified_timestamp(data_json['event_datetime']) - 32400) + release_timestamp_str = data_json.get('event_datetime_text') # JST + + self.write_debug(f'delivery_status = {delivery_status}, archive_mode = {archive_mode}') + + if delivery_status == 'PREPARING': + live_status = 'is_upcoming' + elif delivery_status == 'STARTED': + live_status = 'is_live' + elif delivery_status == 'STOPPED': + if archive_mode != 'ON': + raise ExtractorError( + 'This event has ended and there is no archive for this event', expected=True) + live_status = 'post_live' + elif delivery_status == 'WAIT_CONFIRM_ARCHIVED': + live_status = 'post_live' + elif delivery_status == 'CONFIRMED_ARCHIVE': + live_status = 'was_live' + else: + self.report_warning(f'Unknown delivery_status {delivery_status}, treat it as a live') + live_status = 'is_live' + + formats = [] + + m3u8_playlist_urls = self._search_json( + r'var listChannels\s*=', webpage, 'hls URLs', video_id, contains_pattern=r'\[.+\]', default=[]) + if not m3u8_playlist_urls: + if live_status == 'is_upcoming': + self.raise_no_formats( + f'Could not find the playlist URL. This live event will begin at {release_timestamp_str} JST', expected=True) + else: + self.raise_no_formats( + 'Could not find the playlist URL. This event may not be accessible', expected=True) + elif live_status == 'is_upcoming': + self.raise_no_formats(f'This live event will begin at {release_timestamp_str} JST', expected=True) + elif live_status == 'post_live': + self.raise_no_formats('This event has ended, and the archive will be available shortly', expected=True) + else: + for m3u8_playlist_url in m3u8_playlist_urls: + formats.extend(self._extract_m3u8_formats(m3u8_playlist_url, video_id)) + # FIXME: HTTP request headers need to be updated to continue download + warning = 'Due to technical limitations, the download will be interrupted after one hour' + if live_status == 'is_live': + self.report_warning(warning) + elif live_status == 'was_live': + self.report_warning(f'{warning}. 
You can restart to continue the download') + + return { + 'id': data_json['app_id'], + 'title': data_json.get('app_name'), + 'formats': formats, + 'live_status': live_status, + 'description': data_json.get('content'), + 'release_timestamp': release_timestamp, + } From b3febedbeb662dfdf9b5c1d5799039ad4fc969de Mon Sep 17 00:00:00 2001 From: Elyse <26639800+elyse0@users.noreply.github.com> Date: Thu, 21 Sep 2023 11:30:32 -0600 Subject: [PATCH 174/218] [ie/Canal1,CaracolTvPlay] Add extractors (#7151) Closes #5826 Authored by: elyse0 --- yt_dlp/extractor/_extractors.py | 2 + yt_dlp/extractor/canal1.py | 39 +++++++++ yt_dlp/extractor/caracoltv.py | 136 ++++++++++++++++++++++++++++++++ yt_dlp/extractor/mediastream.py | 8 +- 4 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 yt_dlp/extractor/canal1.py create mode 100644 yt_dlp/extractor/caracoltv.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 3ce6baef2..632d6720e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -296,9 +296,11 @@ from .camsoda import CamsodaIE from .camtasia import CamtasiaEmbedIE from .camwithher import CamWithHerIE +from .canal1 import Canal1IE from .canalalpha import CanalAlphaIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE +from .caracoltv import CaracolTvPlayIE from .carambatv import ( CarambaTVIE, CarambaTVPageIE, diff --git a/yt_dlp/extractor/canal1.py b/yt_dlp/extractor/canal1.py new file mode 100644 index 000000000..587a11ab8 --- /dev/null +++ b/yt_dlp/extractor/canal1.py @@ -0,0 +1,39 @@ +from .common import InfoExtractor + + +class Canal1IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|noticias\.)?canal1\.com\.co/(?:[^?#&])+/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://canal1.com.co/noticias/napa-i-una-cadena-de-produccion-de-arroz-que-se-quedo-en-veremos-y-abandonada-en-el-departamento-del-choco/', + 'info_dict': { + 'id': '63b39f6b354977084b85ab54', + 'display_id': 'napa-i-una-cadena-de-produccion-de-arroz-que-se-quedo-en-veremos-y-abandonada-en-el-departamento-del-choco', + 'title': 'Ñapa I Una cadena de producción de arroz que se quedó en veremos y abandonada en el departamento del Chocó', + 'description': 'md5:bc49c6d64d20610ea1e7daf079a0d013', + 'thumbnail': r're:^https?://[^?#]+63b39f6b354977084b85ab54', + 'ext': 'mp4', + }, + }, { + 'url': 'https://noticias.canal1.com.co/noticias/tres-i-el-triste-record-que-impuso-elon-musk-el-dueno-de-tesla-y-de-twitter/', + 'info_dict': { + 'id': '63b39e93f5fd223aa32250fb', + 'display_id': 'tres-i-el-triste-record-que-impuso-elon-musk-el-dueno-de-tesla-y-de-twitter', + 'title': 'Tres I El triste récord que impuso Elon Musk, el dueño de Tesla y de Twitter', + 'description': 'md5:d9f691f131a21ce6767ca6c05d17d791', + 'thumbnail': r're:^https?://[^?#]+63b39e93f5fd223aa32250fb', + 'ext': 'mp4', + }, + }, { + # Geo-restricted to Colombia + 'url': 'https://canal1.com.co/programas/guerreros-canal-1/video-inedito-guerreros-despedida-kewin-zarate/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + return self.url_result( + self._search_regex(r'"embedUrl"\s*:\s*"([^"]+)', webpage, 'embed url'), + display_id=display_id, url_transparent=True) diff --git a/yt_dlp/extractor/caracoltv.py b/yt_dlp/extractor/caracoltv.py new file mode 100644 index 000000000..79f7752fe --- /dev/null +++ b/yt_dlp/extractor/caracoltv.py @@ -0,0 +1,136 @@ +import base64 +import json 
+import uuid + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + js_to_json, + traverse_obj, + urljoin, +) + + +class CaracolTvPlayIE(InfoExtractor): + _VALID_URL = r'https?://play\.caracoltv\.com/videoDetails/(?P<id>[^/?#]+)' + _NETRC_MACHINE = 'caracoltv-play' + + _TESTS = [{ + 'url': 'https://play.caracoltv.com/videoDetails/OTo4NGFmNjUwOWQ2ZmM0NTg2YWRiOWU0MGNhOWViOWJkYQ==', + 'info_dict': { + 'id': 'OTo4NGFmNjUwOWQ2ZmM0NTg2YWRiOWU0MGNhOWViOWJkYQ==', + 'title': 'La teoría del promedio', + 'description': 'md5:1cdd6d2c13f19ef0d9649ab81a023ac3', + }, + 'playlist_count': 6, + }, { + 'url': 'https://play.caracoltv.com/videoDetails/OTo3OWM4ZTliYzQxMmM0MTMxYTk4Mjk2YjdjNGQ4NGRkOQ==/ella?season=0', + 'info_dict': { + 'id': 'OTo3OWM4ZTliYzQxMmM0MTMxYTk4Mjk2YjdjNGQ4NGRkOQ==', + 'title': 'Ella', + 'description': 'md5:a639b1feb5ddcc0cff92a489b4e544b8', + }, + 'playlist_count': 10, + }, { + 'url': 'https://play.caracoltv.com/videoDetails/OTpiYTY1YTVmOTI5MzI0ZWJhOGZiY2Y3MmRlOWZlYmJkOA==/la-vuelta-al-mundo-en-80-risas-2022?season=0', + 'info_dict': { + 'id': 'OTpiYTY1YTVmOTI5MzI0ZWJhOGZiY2Y3MmRlOWZlYmJkOA==', + 'title': 'La vuelta al mundo en 80 risas 2022', + 'description': 'md5:e97aac36106e5c37ebf947b3350106a4', + }, + 'playlist_count': 17, + }, { + 'url': 'https://play.caracoltv.com/videoDetails/MzoxX3BwbjRmNjB1', + 'only_matching': True, + }] + + _USER_TOKEN = None + + def _extract_app_token(self, webpage): + config_js_path = self._search_regex( + r'<script[^>]+src\s*=\s*"([^"]+coreConfig.js[^"]+)', webpage, 'config js url', fatal=False) + + mediation_config = {} if not config_js_path else self._search_json( + r'mediation\s*:', self._download_webpage( + urljoin('https://play.caracoltv.com/', config_js_path), None, fatal=False, note='Extracting JS config'), + 'mediation_config', None, transform_source=js_to_json, fatal=False) + + key = traverse_obj( + mediation_config, ('live', 'key')) or '795cd9c089a1fc48094524a5eba85a3fca1331817c802f601735907c8bbb4f50' + secret = traverse_obj( + mediation_config, ('live', 'secret')) or '64dec00a6989ba83d087621465b5e5d38bdac22033b0613b659c442c78976fa0' + + return base64.b64encode(f'{key}:{secret}'.encode()).decode() + + def _perform_login(self, email, password): + webpage = self._download_webpage('https://play.caracoltv.com/', None, fatal=False) + app_token = self._extract_app_token(webpage) + + bearer_token = self._download_json( + 'https://eu-gateway.inmobly.com/applications/oauth', None, data=b'', note='Retrieving bearer token', + headers={'Authorization': f'Basic {app_token}'})['token'] + + self._USER_TOKEN = self._download_json( + 'https://eu-gateway.inmobly.com/user/login', None, note='Performing login', headers={ + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {bearer_token}', + }, data=json.dumps({ + 'device_data': { + 'device_id': str(uuid.uuid4()), + 'device_token': '', + 'device_type': 'web' + }, + 'login_data': { + 'enabled': True, + 'email': email, + 'password': password, + } + }).encode())['user_token'] + + def _extract_video(self, video_data, series_id=None, season_id=None, season_number=None): + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_data['stream_url'], series_id, 'mp4') + + return { + 'id': video_data['id'], + 'title': video_data.get('name'), + 'description': video_data.get('description'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': traverse_obj( + video_data, ('extra_thumbs', ..., {'url': 'thumb_url', 'height': 'height', 'width': 'width'})), + 'series_id': 
series_id, + 'season_id': season_id, + 'season_number': int_or_none(season_number), + 'episode_number': int_or_none(video_data.get('item_order')), + 'is_live': video_data.get('entry_type') == 3, + } + + def _extract_series_seasons(self, seasons, series_id): + for season in seasons: + api_response = self._download_json( + 'https://eu-gateway.inmobly.com/feed', series_id, query={'season_id': season['id']}, + headers={'Authorization': f'Bearer {self._USER_TOKEN}'}) + + season_number = season.get('order') + for episode in api_response['items']: + yield self._extract_video(episode, series_id, season['id'], season_number) + + def _real_extract(self, url): + series_id = self._match_id(url) + + if self._USER_TOKEN is None: + self._perform_login('guest@inmobly.com', 'Test@gus1') + + api_response = self._download_json( + 'https://eu-gateway.inmobly.com/feed', series_id, query={'include_ids': series_id}, + headers={'Authorization': f'Bearer {self._USER_TOKEN}'})['items'][0] + + if not api_response.get('seasons'): + return self._extract_video(api_response) + + return self.playlist_result( + self._extract_series_seasons(api_response['seasons'], series_id), + series_id, **traverse_obj(api_response, { + 'title': 'name', + 'description': 'description', + })) diff --git a/yt_dlp/extractor/mediastream.py b/yt_dlp/extractor/mediastream.py index d5c9aab8a..b8cb5a691 100644 --- a/yt_dlp/extractor/mediastream.py +++ b/yt_dlp/extractor/mediastream.py @@ -106,8 +106,12 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - if 'Debido a tu ubicación no puedes ver el contenido' in webpage: - self.raise_geo_restricted() + for message in [ + 'Debido a tu ubicación no puedes ver el contenido', + 'You are not allowed to watch this video: Geo Fencing Restriction' + ]: + if message in webpage: + self.raise_geo_restricted() player_config = self._search_json(r'window\.MDSTRM\.OPTIONS\s*=', webpage, 'metadata', video_id) From 21f40e75dfc0055ea9cdbd7fe2c46c6f9b561afd Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Thu, 21 Sep 2023 13:34:35 -0400 Subject: [PATCH 175/218] [ie/douyutv] Fix extractors (#7652) Closes #2494, Closes #7295 Authored by: c-basalt --- yt_dlp/extractor/douyutv.py | 273 ++++++++++++++++++++++++------------ 1 file changed, 184 insertions(+), 89 deletions(-) diff --git a/yt_dlp/extractor/douyutv.py b/yt_dlp/extractor/douyutv.py index fa40844df..ee8893d5a 100644 --- a/yt_dlp/extractor/douyutv.py +++ b/yt_dlp/extractor/douyutv.py @@ -1,31 +1,72 @@ import time import hashlib -import re import urllib +import uuid from .common import InfoExtractor +from .openload import PhantomJSwrapper from ..utils import ( ExtractorError, + UserNotLive, + determine_ext, + int_or_none, + js_to_json, + parse_resolution, + str_or_none, + traverse_obj, unescapeHTML, - unified_strdate, + url_or_none, + urlencode_postdata, urljoin, ) -class DouyuTVIE(InfoExtractor): - IE_DESC = '斗鱼' +class DouyuBaseIE(InfoExtractor): + def _download_cryptojs_md5(self, video_id): + for url in [ + 'https://cdnjs.cloudflare.com/ajax/libs/crypto-js/3.1.2/rollups/md5.js', + 'https://cdn.bootcdn.net/ajax/libs/crypto-js/3.1.2/rollups/md5.js', + ]: + js_code = self._download_webpage( + url, video_id, note='Downloading signing dependency', fatal=False) + if js_code: + self.cache.store('douyu', 'crypto-js-md5', js_code) + return js_code + raise ExtractorError('Unable to download JS dependency (crypto-js/md5)') + + def _get_cryptojs_md5(self, video_id): + 
return self.cache.load('douyu', 'crypto-js-md5') or self._download_cryptojs_md5(video_id) + + def _calc_sign(self, sign_func, video_id, a): + b = uuid.uuid4().hex + c = round(time.time()) + js_script = f'{self._get_cryptojs_md5(video_id)};{sign_func};console.log(ub98484234("{a}","{b}","{c}"))' + phantom = PhantomJSwrapper(self) + result = phantom.execute(js_script, video_id, + note='Executing JS signing script').strip() + return {i: v[0] for i, v in urllib.parse.parse_qs(result).items()} + + def _search_js_sign_func(self, webpage, fatal=True): + # The greedy look-behind ensures last possible script tag is matched + return self._search_regex( + r'(?:<script.*)?<script[^>]*>(.*?ub98484234.*?)</script>', webpage, 'JS sign func', fatal=fatal) + + +class DouyuTVIE(DouyuBaseIE): + IE_DESC = '斗鱼直播' _VALID_URL = r'https?://(?:www\.)?douyu(?:tv)?\.com/(topic/\w+\?rid=|(?:[^/]+/))*(?P<id>[A-Za-z0-9]+)' _TESTS = [{ - 'url': 'http://www.douyutv.com/iseven', + 'url': 'https://www.douyu.com/pigff', 'info_dict': { - 'id': '17732', - 'display_id': 'iseven', - 'ext': 'flv', - 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': r're:.*m7show@163\.com.*', - 'thumbnail': r're:^https?://.*\.png', - 'uploader': '7师傅', + 'id': '24422', + 'display_id': 'pigff', + 'ext': 'mp4', + 'title': 're:^【PIGFF】.* [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': r'≥15级牌子看鱼吧置顶帖进粉丝vx群', + 'thumbnail': str, + 'uploader': 'pigff', 'is_live': True, + 'live_status': 'is_live', }, 'params': { 'skip_download': True, @@ -85,15 +126,43 @@ class DouyuTVIE(InfoExtractor): 'only_matching': True, }] + def _get_sign_func(self, room_id, video_id): + return self._download_json( + f'https://www.douyu.com/swf_api/homeH5Enc?rids={room_id}', video_id, + note='Getting signing script')['data'][f'room{room_id}'] + + def _extract_stream_formats(self, stream_formats): + formats = [] + for stream_info in traverse_obj(stream_formats, (..., 'data')): + stream_url = urljoin( + traverse_obj(stream_info, 'rtmp_url'), traverse_obj(stream_info, 'rtmp_live')) + if stream_url: + rate_id = traverse_obj(stream_info, ('rate', {int_or_none})) + rate_info = traverse_obj(stream_info, ('multirates', lambda _, v: v['rate'] == rate_id), get_all=False) + ext = determine_ext(stream_url) + formats.append({ + 'url': stream_url, + 'format_id': str_or_none(rate_id), + 'ext': 'mp4' if ext == 'm3u8' else ext, + 'protocol': 'm3u8_native' if ext == 'm3u8' else 'https', + 'quality': rate_id % -10000 if rate_id is not None else None, + **traverse_obj(rate_info, { + 'format': ('name', {str_or_none}), + 'tbr': ('bit', {int_or_none}), + }), + }) + return formats + def _real_extract(self, url): video_id = self._match_id(url) - if video_id.isdigit(): - room_id = video_id - else: - page = self._download_webpage(url, video_id) - room_id = self._html_search_regex( - r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') + webpage = self._download_webpage(url, video_id) + room_id = self._search_regex(r'\$ROOM\.room_id\s*=\s*(\d+)', webpage, 'room id') + + if self._search_regex(r'"videoLoop"\s*:\s*(\d+)', webpage, 'loop', default='') == '1': + raise UserNotLive('The channel is auto-playing VODs', video_id=video_id) + if self._search_regex(r'\$ROOM\.show_status\s*=\s*(\d+)', webpage, 'status', default='') == '2': + raise UserNotLive(video_id=video_id) # Grab metadata from API params = { @@ -102,110 +171,136 @@ def _real_extract(self, url): 'time': int(time.time()), } params['auth'] = hashlib.md5( - 
f'room/{video_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest() - room = self._download_json( + f'room/{room_id}?{urllib.parse.urlencode(params)}zNzMV1y4EMxOHS6I5WKm'.encode()).hexdigest() + room = traverse_obj(self._download_json( f'http://www.douyutv.com/api/v1/room/{room_id}', video_id, - note='Downloading room info', query=params)['data'] + note='Downloading room info', query=params, fatal=False), 'data') # 1 = live, 2 = offline - if room.get('show_status') == '2': - raise ExtractorError('Live stream is offline', expected=True) + if traverse_obj(room, 'show_status') == '2': + raise UserNotLive(video_id=video_id) - video_url = urljoin('https://hls3-akm.douyucdn.cn/', self._search_regex(r'(live/.*)', room['hls_url'], 'URL')) - formats, subs = self._extract_m3u8_formats_and_subtitles(video_url, room_id) + js_sign_func = self._search_js_sign_func(webpage, fatal=False) or self._get_sign_func(room_id, video_id) + form_data = { + 'rate': 0, + **self._calc_sign(js_sign_func, video_id, room_id), + } + stream_formats = [self._download_json( + f'https://www.douyu.com/lapi/live/getH5Play/{room_id}', + video_id, note="Downloading livestream format", + data=urlencode_postdata(form_data))] - title = unescapeHTML(room['room_name']) - description = room.get('show_details') - thumbnail = room.get('room_src') - uploader = room.get('nickname') + for rate_id in traverse_obj(stream_formats[0], ('data', 'multirates', ..., 'rate')): + if rate_id != traverse_obj(stream_formats[0], ('data', 'rate')): + form_data['rate'] = rate_id + stream_formats.append(self._download_json( + f'https://www.douyu.com/lapi/live/getH5Play/{room_id}', + video_id, note=f'Downloading livestream format {rate_id}', + data=urlencode_postdata(form_data))) return { 'id': room_id, - 'display_id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, + 'formats': self._extract_stream_formats(stream_formats), 'is_live': True, - 'subtitles': subs, - 'formats': formats, + **traverse_obj(room, { + 'display_id': ('url', {str}, {lambda i: i[1:]}), + 'title': ('room_name', {unescapeHTML}), + 'description': ('show_details', {str}), + 'uploader': ('nickname', {str}), + 'thumbnail': ('room_src', {url_or_none}), + }) } -class DouyuShowIE(InfoExtractor): +class DouyuShowIE(DouyuBaseIE): _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)' _TESTS = [{ - 'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw', - 'md5': '0c2cfd068ee2afe657801269b2d86214', + 'url': 'https://v.douyu.com/show/mPyq7oVNe5Yv1gLY', 'info_dict': { - 'id': 'rjNBdvnVXNzvE2yw', + 'id': 'mPyq7oVNe5Yv1gLY', 'ext': 'mp4', - 'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场', - 'duration': 7150.08, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': '陈一发儿', - 'uploader_id': 'XrZwYelr5wbK', - 'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK', - 'upload_date': '20170402', + 'title': '四川人小时候的味道“蒜苗回锅肉”,传统菜不能丢,要常做来吃', + 'duration': 633, + 'thumbnail': str, + 'uploader': '美食作家王刚V', + 'uploader_id': 'OVAO4NVx1m7Q', + 'timestamp': 1661850002, + 'upload_date': '20220830', + 'view_count': int, + 'tags': ['美食', '美食综合'], }, }, { 'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw', 'only_matching': True, }] + _FORMATS = { + 'super': '原画', + 'high': '超清', + 'normal': '高清', + } + + _QUALITIES = { + 'super': -1, + 'high': -2, + 'normal': -3, + } + + _RESOLUTIONS = { + 'super': '1920x1080', + 'high': '1280x720', + 'normal': '852x480', + } + def _real_extract(self, url): url = url.replace('vmobile.', 
'v.') video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - room_info = self._parse_json(self._search_regex( - r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id) + video_info = self._search_json( + r'<script>\s*window\.\$DATA\s*=', webpage, + 'video info', video_id, transform_source=js_to_json) - video_info = None + js_sign_func = self._search_js_sign_func(webpage) + form_data = { + 'vid': video_id, + **self._calc_sign(js_sign_func, video_id, video_info['ROOM']['point_id']), + } + url_info = self._download_json( + 'https://v.douyu.com/api/stream/getStreamUrl', video_id, + data=urlencode_postdata(form_data), note="Downloading video formats") - for trial in range(5): - # Sometimes Douyu rejects our request. Let's try it more times - try: - video_info = self._download_json( - 'https://vmobile.douyu.com/video/getInfo', video_id, - query={'vid': video_id}, - headers={ - 'Referer': url, - 'x-requested-with': 'XMLHttpRequest', - }) - break - except ExtractorError: - self._sleep(1, video_id) - - if not video_info: - raise ExtractorError('Can\'t fetch video info') - - formats = self._extract_m3u8_formats( - video_info['data']['video_url'], video_id, - entry_protocol='m3u8_native', ext='mp4') - - upload_date = unified_strdate(self._html_search_regex( - r'<em>上传时间:</em><span>([^<]+)</span>', webpage, - 'upload date', fatal=False)) - - uploader = uploader_id = uploader_url = None - mobj = re.search( - r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"', - webpage) - if mobj: - uploader_id, uploader = mobj.groups() - uploader_url = urljoin(url, '/author/' + uploader_id) + formats = [] + for name, url in traverse_obj(url_info, ('data', 'thumb_video', {dict.items}, ...)): + video_url = traverse_obj(url, ('url', {url_or_none})) + if video_url: + ext = determine_ext(video_url) + formats.append({ + 'format': self._FORMATS.get(name), + 'format_id': name, + 'url': video_url, + 'quality': self._QUALITIES.get(name), + 'ext': 'mp4' if ext == 'm3u8' else ext, + 'protocol': 'm3u8_native' if ext == 'm3u8' else 'https', + **parse_resolution(self._RESOLUTIONS.get(name)) + }) + else: + self.to_screen( + f'"{self._FORMATS.get(name, name)}" format may require logging in. 
{self._login_hint()}')

         return {
             'id': video_id,
-            'title': room_info['name'],
             'formats': formats,
-            'duration': room_info.get('duration'),
-            'thumbnail': room_info.get('pic'),
-            'upload_date': upload_date,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
-            'uploader_url': uploader_url,
+            **traverse_obj(video_info, ('DATA', {
+                'title': ('content', 'title', {str}),
+                'uploader': ('content', 'author', {str}),
+                'uploader_id': ('content', 'up_id', {str_or_none}),
+                'duration': ('content', 'video_duration', {int_or_none}),
+                'thumbnail': ('content', 'video_pic', {url_or_none}),
+                'timestamp': ('content', 'create_time', {int_or_none}),
+                'view_count': ('content', 'view_num', {int_or_none}),
+                'tags': ('videoTag', ..., 'tagName', {str}),
+            }))
         }

From 5fccabac27ca3c1165ade1b0df6fbadc24258dc2 Mon Sep 17 00:00:00 2001
From: Simon
Date: Thu, 21 Sep 2023 19:37:58 +0200
Subject: [PATCH 176/218] [ie/rbgtum] Fix extraction and support new URL format (#7690)

Authored by: simon300000
---
 yt_dlp/extractor/_extractors.py |  1 +
 yt_dlp/extractor/rbgtum.py | 79 ++++++++++++++++++++++++++-------
 2 files changed, 65 insertions(+), 15 deletions(-)

diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index 632d6720e..9cda06d8f 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -1601,6 +1601,7 @@
 from .rbgtum import (
     RbgTumIE,
     RbgTumCourseIE,
+    RbgTumNewCourseIE,
 )
 from .rcs import (
     RCSIE,
diff --git a/yt_dlp/extractor/rbgtum.py b/yt_dlp/extractor/rbgtum.py
index 47649cfc5..c8a331f3e 100644
--- a/yt_dlp/extractor/rbgtum.py
+++ b/yt_dlp/extractor/rbgtum.py
@@ -1,10 +1,11 @@
 import re

 from .common import InfoExtractor
+from ..utils import parse_qs, remove_start, traverse_obj, ExtractorError


 class RbgTumIE(InfoExtractor):
-    _VALID_URL = r'https://live\.rbg\.tum\.de/w/(?P<id>.+)'
+    _VALID_URL = r'https://(?:live\.rbg\.tum\.de|tum\.live)/w/(?P<id>[^?#]+)'
     _TESTS = [{
         # Combined view
         'url': 'https://live.rbg.tum.de/w/cpp/22128',
@@ -35,16 +36,18 @@ class RbgTumIE(InfoExtractor):
             'title': 'Fachschaftsvollversammlung',
             'series': 'Fachschaftsvollversammlung Informatik',
         }
+    }, {
+        'url': 'https://tum.live/w/linalginfo/27102',
+        'only_matching': True,
     },
     ]

     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)

-        m3u8 = self._html_search_regex(r'(https://.+?\.m3u8)', webpage, 'm3u8')
-        lecture_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
-        lecture_series_title = self._html_search_regex(
-            r'(?s)<title\b[^>]*>\s*(?:TUM-Live\s\|\s?)?([^:]+):?.*?</title>', webpage, 'series')
+        m3u8 = self._html_search_regex(r'"(https://[^"]+\.m3u8[^"]*)', webpage, 'm3u8')
+        lecture_title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title', fatal=False)
+        lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ')

         formats = self._extract_m3u8_formats(m3u8, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')

@@ -57,9 +60,9 @@ def _real_extract(self, url):


 class RbgTumCourseIE(InfoExtractor):
-    _VALID_URL = r'https://live\.rbg\.tum\.de/course/(?P<id>.+)'
+    _VALID_URL = r'https://(?P<hostname>(?:live\.rbg\.tum\.de|tum\.live))/old/course/(?P<id>(?P<year>\d+)/(?P<term>\w+)/(?P<slug>[^/?#]+))'
     _TESTS = [{
-        'url': 'https://live.rbg.tum.de/course/2022/S/fpv',
+        'url': 'https://live.rbg.tum.de/old/course/2022/S/fpv',
         'info_dict': {
             'title': 'Funktionale Programmierung und Verifikation (IN0003)',
             'id': '2022/S/fpv',
@@ -69,7 +72,7 @@ class RbgTumCourseIE(InfoExtractor):
         },
         'playlist_count': 13,
     }, {
-        'url': 'https://live.rbg.tum.de/course/2022/W/set',
+        'url': 'https://live.rbg.tum.de/old/course/2022/W/set',
         'info_dict': {
             'title': 'SET FSMPIC',
             'id': '2022/W/set',
@@ -78,16 +81,62 @@ class RbgTumCourseIE(InfoExtractor):
             'noplaylist': False,
         },
         'playlist_count': 6,
+    }, {
+        'url': 'https://tum.live/old/course/2023/S/linalginfo',
+        'only_matching': True,
     },
     ]

     def _real_extract(self, url):
-        course_id = self._match_id(url)
-        webpage = self._download_webpage(url, course_id)
+        course_id, hostname, year, term, slug = self._match_valid_url(url).group('id', 'hostname', 'year', 'term', 'slug')
+        meta = self._download_json(
+            f'https://{hostname}/api/courses/{slug}/', course_id, fatal=False,
+            query={'year': year, 'term': term}) or {}
+        lecture_series_title = meta.get('Name')
+        lectures = [self.url_result(f'https://{hostname}/w/{slug}/{stream_id}', RbgTumIE)
+                    for stream_id in traverse_obj(meta, ('Streams', ..., 'ID'))]

-        lecture_series_title = self._html_search_regex(r'(?si)<h1.*?>(.*)</h1>', webpage, 'title')
+        if not lectures:
+            webpage = self._download_webpage(url, course_id)
+            lecture_series_title = remove_start(self._html_extract_title(webpage), 'TUM-Live | ')
+            lectures = [self.url_result(f'https://{hostname}{lecture_path}', RbgTumIE)
+                        for lecture_path in re.findall(r'href="(/w/[^/"]+/[^/"]+)"', webpage)]

-        lecture_urls = []
-        for lecture_url in re.findall(r'(?i)href="/w/(.+)(?<!/cam)(?<!/pres)(?<!/chat)"', webpage):
-            lecture_urls.append(self.url_result('https://live.rbg.tum.de/w/' + lecture_url, ie=RbgTumIE.ie_key()))
-
-        return self.playlist_result(lecture_urls, course_id, lecture_series_title)
+        return self.playlist_result(lectures, course_id, lecture_series_title)
+
+
+class RbgTumNewCourseIE(InfoExtractor):
+    _VALID_URL = r'https://(?P<hostname>(?:live\.rbg\.tum\.de|tum\.live))/\?'
+    _TESTS = [{
+        'url': 'https://live.rbg.tum.de/?year=2022&term=S&slug=fpv&view=3',
+        'info_dict': {
+            'title': 'Funktionale Programmierung und Verifikation (IN0003)',
+            'id': '2022/S/fpv',
+        },
+        'params': {
+            'noplaylist': False,
+        },
+        'playlist_count': 13,
+    }, {
+        'url': 'https://live.rbg.tum.de/?year=2022&term=W&slug=set&view=3',
+        'info_dict': {
+            'title': 'SET FSMPIC',
+            'id': '2022/W/set',
+        },
+        'params': {
+            'noplaylist': False,
+        },
+        'playlist_count': 6,
+    }, {
+        'url': 'https://tum.live/?year=2023&term=S&slug=linalginfo&view=3',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        query = parse_qs(url)
+        errors = [key for key in ('year', 'term', 'slug') if not query.get(key)]
+        if errors:
+            raise ExtractorError(f'Input URL is missing query parameters: {", ".join(errors)}')
+        year, term, slug = query['year'][0], query['term'][0], query['slug'][0]
+        hostname = self._match_valid_url(url).group('hostname')
+
+        return self.url_result(f'https://{hostname}/old/course/{year}/{term}/{slug}', RbgTumCourseIE)

From b84fda7388dd20d38921e23b469147f3957c1812 Mon Sep 17 00:00:00 2001
From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com>
Date: Thu, 21 Sep 2023 17:45:18 +0000
Subject: [PATCH 177/218] [ie/bilibili] Extract Dolby audio formats (#8142)

Closes #4050
Authored by: ClosedPort22
---
 yt_dlp/extractor/bilibili.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py
index 5e7042dbb..9119f396b 100644
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -49,14 +49,14 @@ def extract_formats(self, play_info):
             for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality']))
         }

-        audios = traverse_obj(play_info, ('dash', 'audio', ...))
+        audios = traverse_obj(play_info, ('dash', (None, 'dolby'), 'audio', ..., {dict}))
         flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio'))
         if flac_audio:
             audios.append(flac_audio)
         formats = [{
             'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'),
             'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')),
-            'acodec': audio.get('codecs'),
+            'acodec': traverse_obj(audio, ('codecs', {str.lower})),
             'vcodec': 'none',
             'tbr': float_or_none(audio.get('bandwidth'), scale=1000),
             'filesize': int_or_none(audio.get('size')),
@@ -71,6 +71,7 @@ def extract_formats(self, play_info):
             'height': int_or_none(video.get('height')),
             'vcodec': video.get('codecs'),
             'acodec': 'none' if audios else None,
+            'dynamic_range': {126: 'DV', 125: 'HDR10'}.get(int_or_none(video.get('id'))),
             'tbr': float_or_none(video.get('bandwidth'), scale=1000),
             'filesize': int_or_none(video.get('size')),
             'quality': int_or_none(video.get('id')),

From a5e264d74b4bd60c6e7ec4e38f1a23af4e420531 Mon Sep 17 00:00:00 2001
From: kylegustavo
Date: Thu, 21 Sep 2023 10:46:49 -0700
Subject: [PATCH 178/218] [ie/Expressen] Improve `_VALID_URL` (#8153)

Closes #8141
Authored by: kylegustavo
---
 yt_dlp/extractor/expressen.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/extractor/expressen.py b/yt_dlp/extractor/expressen.py
index 86967b631..b96f2e4cb 100644
--- a/yt_dlp/extractor/expressen.py
+++ b/yt_dlp/extractor/expressen.py
@@ -11,8 +11,8 @@ class ExpressenIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:www\.)?(?:expressen|di)\.se/
-                        (?:(?:tvspelare/video|videoplayer/embed)/)?
-                        tv/(?:[^/]+/)*
+                        (?:(?:tvspelare/video|video-?player/embed)/)?
+                        (?:tv|nyheter)/(?:[^/?#]+/)*
                         (?P<id>[^/?#&]+)
                 '''
     _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1']
@@ -42,6 +42,12 @@ class ExpressenIE(InfoExtractor):
     }, {
         'url': 'https://www.di.se/videoplayer/embed/tv/ditv/borsmorgon/implantica-rusar-70--under-borspremiaren-hor-styrelsemedlemmen/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di',
         'only_matching': True,
+    }, {
+        'url': 'https://www.expressen.se/video-player/embed/tv/nyheter/ekero-fodda-olof-gustafsson-forvaltar-knarkbaronen-pablo-escobars-namn',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.expressen.se/nyheter/efter-egna-telefonbluffen-escobar-stammer-klarna/',
+        'only_matching': True,
     }]

     def _real_extract(self, url):

From 2269065ad60cb0ab62408ae6a7b20283e5252232 Mon Sep 17 00:00:00 2001
From: std-move <26625259+std-move@users.noreply.github.com>
Date: Thu, 21 Sep 2023 20:19:52 +0200
Subject: [PATCH 179/218] [ie/NovaEmbed] Fix extractor (#7910)

Closes #8025
Authored by: std-move
---
 yt_dlp/extractor/nova.py | 116 +++++++++++++++------------------------
 1 file changed, 45 insertions(+), 71 deletions(-)

diff --git a/yt_dlp/extractor/nova.py b/yt_dlp/extractor/nova.py
index 8bd3fd472..bd0c4ebe3 100644
--- a/yt_dlp/extractor/nova.py
+++ b/yt_dlp/extractor/nova.py
@@ -6,7 +6,6 @@
     determine_ext,
     int_or_none,
     js_to_json,
-    qualities,
     traverse_obj,
     unified_strdate,
     url_or_none,
@@ -49,77 +48,52 @@ def _real_extract(self, url):
         duration = None
         formats = []

-        player = self._parse_json(
-            self._search_regex(
-                (r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P<json>{.*?})\s*\)(?:\s*\))?\s*,',
-                 r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'),
-                webpage, 'player', default='{}', group='json'), video_id, fatal=False)
-        if player:
-            for format_id, format_list in player['tracks'].items():
-                if not isinstance(format_list, list):
-                    format_list = [format_list]
-                for format_dict in format_list:
-                    if not isinstance(format_dict, dict):
-                        continue
-                    if (not
self.get_param('allow_unplayable_formats') - and traverse_obj(format_dict, ('drm', 'keySystem'))): - has_drm = True - continue - format_url = url_or_none(format_dict.get('src')) - format_type = format_dict.get('type') - ext = determine_ext(format_url) - if (format_type == 'application/x-mpegURL' - or format_id == 'HLS' or ext == 'm3u8'): - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - elif (format_type == 'application/dash+xml' - or format_id == 'DASH' or ext == 'mpd'): - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - else: - formats.append({ - 'url': format_url, - }) - duration = int_or_none(player.get('duration')) - else: - # Old path, not actual as of 08.04.2020 - bitrates = self._parse_json( - self._search_regex( - r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'), - video_id, transform_source=js_to_json) - - QUALITIES = ('lq', 'mq', 'hq', 'hd') - quality_key = qualities(QUALITIES) - - for format_id, format_list in bitrates.items(): - if not isinstance(format_list, list): - format_list = [format_list] - for format_url in format_list: - format_url = url_or_none(format_url) - if not format_url: - continue - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, ext='mp4', - entry_protocol='m3u8_native', m3u8_id='hls', - fatal=False)) - continue - f = { + def process_format_list(format_list, format_id=""): + nonlocal formats, has_drm + if not isinstance(format_list, list): + format_list = [format_list] + for format_dict in format_list: + if not isinstance(format_dict, dict): + continue + if (not self.get_param('allow_unplayable_formats') + and traverse_obj(format_dict, ('drm', 'keySystem'))): + has_drm = True + continue + format_url = url_or_none(format_dict.get('src')) + format_type = format_dict.get('type') + ext = determine_ext(format_url) + if (format_type == 'application/x-mpegURL' + or format_id == 'HLS' or ext == 'm3u8'): + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + elif (format_type == 'application/dash+xml' + or format_id == 'DASH' or ext == 'mpd'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ 'url': format_url, - } - f_id = format_id - for quality in QUALITIES: - if '%s.mp4' % quality in format_url: - f_id += '-%s' % quality - f.update({ - 'quality': quality_key(quality), - 'format_note': quality.upper(), - }) - break - f['format_id'] = f_id - formats.append(f) + }) + + player = self._search_json( + r'player:', webpage, 'player', video_id, fatal=False, end_pattern=r';\s*') + if player: + for src in traverse_obj(player, ('lib', 'source', 'sources', ...)): + process_format_list(src) + duration = traverse_obj(player, ('sourceInfo', 'duration', {int_or_none})) + if not formats and not has_drm: + # older code path, in use before August 2023 + player = self._parse_json( + self._search_regex( + (r'(?:(?:replacePlaceholders|processAdTagModifier).*?:\s*)?(?:replacePlaceholders|processAdTagModifier)\s*\(\s*(?P{.*?})\s*\)(?:\s*\))?\s*,', + r'Player\.init\s*\([^,]+,(?P\s*\w+\s*\?)?\s*(?P{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)'), + webpage, 'player', group='json'), video_id) + if player: + for format_id, format_list in player['tracks'].items(): + process_format_list(format_list, format_id) + duration = 
int_or_none(player.get('duration')) if not formats and has_drm: self.report_drm(video_id) From 52414d64ca7b92d3f83964cdd68247989b0c4625 Mon Sep 17 00:00:00 2001 From: bashonly Date: Thu, 21 Sep 2023 16:51:57 -0500 Subject: [PATCH 180/218] [utils] `js_to_json`: Handle `Array` objects Authored by: Grub4K, std-move Co-authored-by: std-move <26625259+std-move@users.noreply.github.com> Co-authored-by: Simon Sawicki --- test/test_utils.py | 6 ++++++ yt_dlp/utils/_utils.py | 1 + 2 files changed, 7 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 91e3ffd39..47d1f71bf 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1218,6 +1218,12 @@ def test_js_to_json_template_literal(self): self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""') self.assertEqual(js_to_json('`${name}`', {}), '"name"') + def test_js_to_json_map_array_constructors(self): + self.assertEqual(json.loads(js_to_json('new Map([["a", 5]])')), {'a': 5}) + self.assertEqual(json.loads(js_to_json('Array(5, 10)')), [5, 10]) + self.assertEqual(json.loads(js_to_json('new Array(15,5)')), [15, 5]) + self.assertEqual(json.loads(js_to_json('new Map([Array(5, 10),new Array(15,5)])')), {'5': 10, '15': 5}) + def test_extract_attributes(self): self.assertEqual(extract_attributes(''), {'x': 'y'}) self.assertEqual(extract_attributes(""), {'x': 'y'}) diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index ef26de116..213ccc636 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -2727,6 +2727,7 @@ def fix_kv(m): def create_map(mobj): return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) + code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code) code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) if not strict: code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) From 904a19ee93195ce0bd4b08bd22b186120afb5b17 Mon Sep 17 00:00:00 2001 From: bashonly Date: Thu, 21 Sep 2023 16:54:57 -0500 Subject: [PATCH 181/218] [ie] Make `_search_nuxt_data` more lenient Authored by: std-move Co-authored-by: std-move <26625259+std-move@users.noreply.github.com> --- yt_dlp/extractor/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 7deab995c..c94b4abdc 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1687,7 +1687,7 @@ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): """Parses Nuxt.js metadata. 
This works as long as the function __NUXT__ invokes is a pure function"""
         rectx = re.escape(context_name)
-        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
+        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){(?:.*?)return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
         js, arg_keys, arg_vals = self._search_regex(
             (rf'<script>\s*window\.{rectx}={rectx}\|\|{{}};{rectx}\.{FUNCTION_RE}', rf'{rectx}\(.*?{FUNCTION_RE}'),
             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),

From 568f08051841aedea968258889539741e26009e9 Mon Sep 17 00:00:00 2001
From: std-move <26625259+std-move@users.noreply.github.com>
Date: Fri, 22 Sep 2023 00:20:52 +0200
Subject: [PATCH 182/218] [ie/iprima] Fix extractor (#7216)

Closes #7229
Authored by: std-move
---
 yt_dlp/extractor/iprima.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py
index 6dec1510d..f7aa579b3 100644
--- a/yt_dlp/extractor/iprima.py
+++ b/yt_dlp/extractor/iprima.py
@@ -134,10 +134,17 @@ def _real_extract(self, url):
             ), webpage, 'real id', group='id', default=None)

         if not video_id:
-            nuxt_data = self._search_nuxt_data(webpage, video_id, traverse='data')
+            nuxt_data = self._search_nuxt_data(webpage, video_id, traverse='data', fatal=False)
             video_id = traverse_obj(
                 nuxt_data, (..., 'content', 'additionals', 'videoPlayId', {str}), get_all=False)

+        if not video_id:
+            nuxt_data = self._search_json(
+                r'<script[^>]+\bid=["\']__NUXT_DATA__["\'][^>]*>',
+                webpage, 'nuxt data', None, end_pattern=r'</script>', contains_pattern=r'\[(?s:.+)\]')
+
+            video_id = traverse_obj(nuxt_data, lambda _, v: re.fullmatch(r'p\d+', v), get_all=False)
+
         if not video_id:
             self.raise_no_formats('Unable to extract video ID from webpage')

From 661c9a1d029296b28e0b2f8be8a72a43abaf6536 Mon Sep 17 00:00:00 2001
From: bashonly <88596187+bashonly@users.noreply.github.com>
Date: Thu, 21 Sep 2023 17:48:57 -0500
Subject: [PATCH 183/218] [test:download] Test for `expected_exception`

Authored by: at-wat

Co-authored-by: Atsushi Watanabe
---
 test/test_download.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/test/test_download.py b/test/test_download.py
index 6f00a4ded..253079249 100755
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -31,6 +31,7 @@
     DownloadError,
     ExtractorError,
     UnavailableVideoError,
+    YoutubeDLError,
     format_bytes,
     join_nonempty,
 )
@@ -100,6 +101,8 @@ def print_skipping(reason):
             print_skipping('IE marked as not _WORKING')

         for tc in test_cases:
+            if tc.get('expected_exception'):
+                continue
             info_dict = tc.get('info_dict', {})
             params = tc.get('params', {})
             if not info_dict.get('id'):
@@ -139,6 +142,17 @@ def get_tc_filename(tc):

         res_dict = None

+        def match_exception(err):
+            expected_exception = test_case.get('expected_exception')
+            if not expected_exception:
+                return False
+            if err.__class__.__name__ == expected_exception:
+                return True
+            for exc in err.exc_info:
+                if exc.__class__.__name__ == expected_exception:
+                    return True
+            return False
+
         def try_rm_tcs_files(tcs=None):
             if tcs is None:
                 tcs = test_cases
@@ -161,6 +175,8 @@ def try_rm_tcs_files(tcs=None):
             except (DownloadError, ExtractorError) as err:
                 # Check if the exception is not a network related one
                 if not isinstance(err.exc_info[1], (TransportError, UnavailableVideoError)) or (isinstance(err.exc_info[1], HTTPError) and err.exc_info[1].status == 503):
+                    if match_exception(err):
+                        return
                     err.msg = f'{getattr(err, "msg", err)} ({tname})'
                     raise

@@ -171,6 +187,10 @@ def try_rm_tcs_files(tcs=None):
                     print(f'Retrying: {try_num} failed tries\n\n##########\n\n')

                 try_num += 1
+            except YoutubeDLError as err:
+                if match_exception(err):
+
return + raise else: break From c1d71d0d9f41db5e4306c86af232f5f6220a130b Mon Sep 17 00:00:00 2001 From: Atsushi Watanabe Date: Fri, 22 Sep 2023 08:04:05 +0900 Subject: [PATCH 184/218] [ie/twitcasting] Support `--wait-for-video` (#7975) Authored by: at-wat --- yt_dlp/extractor/twitcasting.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/twitcasting.py b/yt_dlp/extractor/twitcasting.py index 3890d5d8f..540e217fd 100644 --- a/yt_dlp/extractor/twitcasting.py +++ b/yt_dlp/extractor/twitcasting.py @@ -5,8 +5,9 @@ from .common import InfoExtractor from ..dependencies import websockets from ..utils import ( - clean_html, ExtractorError, + UserNotLive, + clean_html, float_or_none, get_element_by_class, get_element_by_id, @@ -235,6 +236,9 @@ class TwitCastingLiveIE(InfoExtractor): _TESTS = [{ 'url': 'https://twitcasting.tv/ivetesangalo', 'only_matching': True, + }, { + 'url': 'https://twitcasting.tv/c:unusedlive', + 'expected_exception': 'UserNotLive', }] def _real_extract(self, url): @@ -260,7 +264,7 @@ def _real_extract(self, url): r'(?s)\d+)"\s*>.+?', webpage, 'current live ID 2', default=None, group='video_id') if not current_live: - raise ExtractorError('The user is not currently live') + raise UserNotLive(video_id=uploader_id) return self.url_result('https://twitcasting.tv/%s/movie/%s' % (uploader_id, current_live)) From c2da0b5ea215298135f76e3dc14b972a3c4afacb Mon Sep 17 00:00:00 2001 From: bashonly Date: Sat, 23 Sep 2023 14:54:00 -0500 Subject: [PATCH 185/218] [ie/ArteTV] Fix HLS formats extraction Closes #8156 Authored by: bashonly --- yt_dlp/extractor/arte.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index e3cc5afb0..a19cd2a3a 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -169,7 +169,7 @@ def _real_extract(self, url): ))) short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?') - if stream['protocol'].startswith('HLS'): + if 'HLS' in stream['protocol']: fmts, subs = self._extract_m3u8_formats_and_subtitles( stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False) for fmt in fmts: From 5ca095cbcde3e32642a4fe5b2d69e8e3c785a021 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Sat, 23 Sep 2023 15:00:31 -0500 Subject: [PATCH 186/218] [cleanup] Misc (#8182) Closes #7796, Closes #8028 Authored by: barsnick, sqrtNOT, gamer191, coletdjnz, Grub4K, bashonly --- CONTRIBUTING.md | 8 ++++---- README.md | 2 +- devscripts/make_changelog.py | 2 +- test/test_YoutubeDL.py | 1 - test/test_networking_utils.py | 6 +++--- yt_dlp/YoutubeDL.py | 6 +++--- yt_dlp/compat/urllib/__init__.py | 2 +- yt_dlp/extractor/abc.py | 1 - yt_dlp/extractor/ign.py | 4 ---- yt_dlp/extractor/nebula.py | 1 - yt_dlp/extractor/peekvids.py | 1 - yt_dlp/extractor/radiofrance.py | 2 +- yt_dlp/extractor/rcs.py | 6 +++--- yt_dlp/extractor/rokfin.py | 1 - yt_dlp/extractor/s4c.py | 2 -- yt_dlp/extractor/sovietscloset.py | 1 - yt_dlp/extractor/youtube.py | 2 +- yt_dlp/networking/__init__.py | 2 +- yt_dlp/networking/_urllib.py | 2 +- yt_dlp/networking/exceptions.py | 4 ++-- 20 files changed, 22 insertions(+), 34 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a8587fe92..90e7faf7c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -217,7 +217,7 @@ ## Adding support for a new site 1. Add an import in [`yt_dlp/extractor/_extractors.py`](yt_dlp/extractor/_extractors.py). 
Note that the class name must end with `IE`. 1. Run `python test/test_download.py TestDownload.test_YourExtractor` (note that `YourExtractor` doesn't end with `IE`). This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, the tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. You can also run all the tests in one go with `TestDownload.test_YourExtractor_all` 1. Make sure you have atleast one test for your extractor. Even if all videos covered by the extractor are expected to be inaccessible for automated testing, tests should still be added with a `skip` parameter indicating why the particular test is disabled from running. -1. Have a look at [`yt_dlp/extractor/common.py`](yt_dlp/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](yt_dlp/extractor/common.py#L91-L426). Add tests and code for as many as you want. +1. Have a look at [`yt_dlp/extractor/common.py`](yt_dlp/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](yt_dlp/extractor/common.py#L119-L440). Add tests and code for as many as you want. 1. Make sure your code follows [yt-dlp coding conventions](#yt-dlp-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart): $ flake8 yt_dlp/extractor/yourextractor.py @@ -251,7 +251,7 @@ ## yt-dlp coding conventions ### Mandatory and optional metafields -For extraction to work yt-dlp relies on metadata your extractor extracts and provides to yt-dlp expressed by an [information dictionary](yt_dlp/extractor/common.py#L91-L426) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by yt-dlp: +For extraction to work yt-dlp relies on metadata your extractor extracts and provides to yt-dlp expressed by an [information dictionary](yt_dlp/extractor/common.py#L119-L440) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by yt-dlp: - `id` (media identifier) - `title` (media title) @@ -696,7 +696,7 @@ #### Examples ### Use convenience conversion and parsing functions -Wrap all extracted numeric data into safe functions from [`yt_dlp/utils.py`](yt_dlp/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. +Wrap all extracted numeric data into safe functions from [`yt_dlp/utils/`](yt_dlp/utils/): `int_or_none`, `float_or_none`. Use them for string to number conversions as well. Use `url_or_none` for safe URL processing. @@ -704,7 +704,7 @@ ### Use convenience conversion and parsing functions Use `unified_strdate` for uniform `upload_date` or any `YYYYMMDD` meta field extraction, `unified_timestamp` for uniform `timestamp` extraction, `parse_filesize` for `filesize` extraction, `parse_count` for count meta fields extraction, `parse_resolution`, `parse_duration` for `duration` extraction, `parse_age_limit` for `age_limit` extraction. -Explore [`yt_dlp/utils.py`](yt_dlp/utils.py) for more useful convenience functions. +Explore [`yt_dlp/utils/`](yt_dlp/utils/) for more useful convenience functions. 
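To make the above concrete, here is a minimal, self-contained sketch of these helpers in action — the `meta` dict below is invented purely for illustration, not taken from any real extractor, and running it assumes yt-dlp is installed:

```python
from yt_dlp.utils import int_or_none, parse_duration, unified_strdate, url_or_none

# Invented metadata blob, standing in for whatever a site's API might return
meta = {
    'views': '1618',
    'duration': '1:02:03',
    'published': 'December 31, 2021',
    'thumb': 'javascript:void(0)',  # deliberately not a usable URL
}

print(int_or_none(meta['views']))          # 1618
print(int_or_none(meta.get('likes')))      # None - missing data never raises
print(parse_duration(meta['duration']))    # 3723.0 (seconds)
print(unified_strdate(meta['published']))  # '20211231'
print(url_or_none(meta['thumb']))          # None - rejected as unsafe/invalid
```

The point of these helpers is that malformed or absent input degrades to `None` instead of raising, which keeps extractors robust against incomplete site metadata.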
 #### Examples

diff --git a/README.md b/README.md
index d94d8ea82..d9b11952d 100644
--- a/README.md
+++ b/README.md
@@ -1800,7 +1800,7 @@ # EXTRACTOR ARGUMENTS
 #### youtube
 * `lang`: Prefer translated metadata (`title`, `description` etc) of this language code (case-sensitive). By default, the video primary language metadata is preferred, with a fallback to `en` translated. See [youtube.py](https://github.com/yt-dlp/yt-dlp/blob/c26f9b991a0681fd3ea548d535919cec1fbbd430/yt_dlp/extractor/youtube.py#L381-L390) for list of supported content language codes
 * `skip`: One or more of `hls`, `dash` or `translated_subs` to skip extraction of the m3u8 manifests, dash manifests and [auto-translated subtitles](https://github.com/yt-dlp/yt-dlp/issues/4090#issuecomment-1158102032) respectively
-* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
+* `player_client`: Clients to extract video data from. The main clients are `web`, `android` and `ios` with variants `_music`, `_embedded`, `_embedscreen`, `_creator` (e.g. `web_embedded`); and `mweb`, `mweb_embedscreen` and `tv_embedded` (agegate bypass) with no variants. By default, `ios,android,web` is used, but `tv_embedded` and `creator` variants are added as required for age-gated videos. Similarly, the music variants are added for `music.youtube.com` urls. You can use `all` to use all the clients, and `default` for the default clients.
 * `player_skip`: Skip some network requests that are generally needed for robust extraction. One or more of `configs` (skip client configs), `webpage` (skip initial webpage), `js` (skip js player). While these options can help reduce the number of requests needed or avoid some rate-limiting, they could cause some issues. See [#860](https://github.com/yt-dlp/yt-dlp/pull/860) for more details
 * `player_params`: YouTube player parameters to use for player requests. Will overwrite any default ones set by yt-dlp.
 * `comment_sort`: `top` or `new` (default) - choose comment sorting mode (on YouTube's side)
diff --git a/devscripts/make_changelog.py b/devscripts/make_changelog.py
index ac68dcd19..9ff65db14 100644
--- a/devscripts/make_changelog.py
+++ b/devscripts/make_changelog.py
@@ -260,7 +260,7 @@ class CommitRange:
     AUTHOR_INDICATOR_RE = re.compile(r'Authored by:? ', re.IGNORECASE)
     MESSAGE_RE = re.compile(r'''
         (?:\[(?P<prefix>[^\]]+)\]\ )?
-        (?:(?P<sub_details>`?[^:`]+`?): )?
+        (?:(?P<sub_details>`?[\w.-]+`?): )?
         (?P<message>.+?)
         (?:\ \((?P<issues>\#\d+(?:,\ \#\d+)*)\))?
''', re.VERBOSE | re.DOTALL) diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 3cfb61fb2..916ee48b9 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -631,7 +631,6 @@ def test_add_extra_info(self): self.assertEqual(test_dict['playlist'], 'funny videos') outtmpl_info = { - 'id': '1234', 'id': '1234', 'ext': 'mp4', 'width': None, diff --git a/test/test_networking_utils.py b/test/test_networking_utils.py index dbf656090..419aae1e4 100644 --- a/test/test_networking_utils.py +++ b/test/test_networking_utils.py @@ -269,14 +269,14 @@ def test_compat_http_error_autoclose(self): assert not response.closed def test_incomplete_read_error(self): - error = IncompleteRead(b'test', 3, cause='test') + error = IncompleteRead(4, 3, cause='test') assert isinstance(error, IncompleteRead) assert repr(error) == '' assert str(error) == error.msg == '4 bytes read, 3 more expected' - assert error.partial == b'test' + assert error.partial == 4 assert error.expected == 3 assert error.cause == 'test' - error = IncompleteRead(b'aaa') + error = IncompleteRead(3) assert repr(error) == '' assert str(error) == '3 bytes read' diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 1feed3052..39aaf2c2e 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -239,9 +239,9 @@ class YoutubeDL: 'selected' (check selected formats), or None (check only if requested by extractor) paths: Dictionary of output paths. The allowed keys are 'home' - 'temp' and the keys of OUTTMPL_TYPES (in utils.py) + 'temp' and the keys of OUTTMPL_TYPES (in utils/_utils.py) outtmpl: Dictionary of templates for output names. Allowed keys - are 'default' and the keys of OUTTMPL_TYPES (in utils.py). + are 'default' and the keys of OUTTMPL_TYPES (in utils/_utils.py). For compatibility with youtube-dl, a single string can also be used outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names @@ -422,7 +422,7 @@ class YoutubeDL: asked whether to download the video. - Raise utils.DownloadCancelled(msg) to abort remaining downloads when a video is rejected. - match_filter_func in utils.py is one example for this. + match_filter_func in utils/_utils.py is one example for this. color: A Dictionary with output stream names as keys and their respective color policy as values. Can also just be a single color policy, diff --git a/yt_dlp/compat/urllib/__init__.py b/yt_dlp/compat/urllib/__init__.py index b27cc6133..9084b3c2b 100644 --- a/yt_dlp/compat/urllib/__init__.py +++ b/yt_dlp/compat/urllib/__init__.py @@ -1,7 +1,7 @@ # flake8: noqa: F405 from urllib import * # noqa: F403 -del request +del request # noqa: F821 from . import request # noqa: F401 from ..compat_utils import passthrough_module diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index f56133eb3..d2cf5f7c5 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -180,7 +180,6 @@ class ABCIViewIE(InfoExtractor): _VALID_URL = r'https?://iview\.abc\.net\.au/(?:[^/]+/)*video/(?P[^/?#]+)' _GEO_COUNTRIES = ['AU'] - # ABC iview programs are normally available for 14 days only. 
_TESTS = [{ 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', 'md5': '67715ce3c78426b11ba167d875ac6abf', diff --git a/yt_dlp/extractor/ign.py b/yt_dlp/extractor/ign.py index 64875f8ce..1c4f105e9 100644 --- a/yt_dlp/extractor/ign.py +++ b/yt_dlp/extractor/ign.py @@ -197,10 +197,6 @@ class IGNVideoIE(IGNBaseIE): 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', 'duration': 298, 'tags': 'count:13', - 'display_id': '112203', - 'thumbnail': 'https://sm.ign.com/ign_me/video/h/how-hitman/how-hitman-aims-to-be-different-than-every-other-s_8z14.jpg', - 'duration': 298, - 'tags': 'count:13', }, 'expected_warnings': ['HTTP Error 400: Bad Request'], }, { diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 4f3e691b7..8fba2bcf7 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -127,7 +127,6 @@ class NebulaIE(NebulaBaseIE): 'channel_id': 'lindsayellis', 'uploader': 'Lindsay Ellis', 'uploader_id': 'lindsayellis', - 'timestamp': 1533009600, 'uploader_url': 'https://nebula.tv/lindsayellis', 'series': 'Lindsay Ellis', 'display_id': 'that-time-disney-remade-beauty-and-the-beast', diff --git a/yt_dlp/extractor/peekvids.py b/yt_dlp/extractor/peekvids.py index d1fc058b9..41f591b09 100644 --- a/yt_dlp/extractor/peekvids.py +++ b/yt_dlp/extractor/peekvids.py @@ -146,7 +146,6 @@ class PlayVidsIE(PeekVidsBaseIE): 'uploader': 'Brazzers', 'age_limit': 18, 'view_count': int, - 'age_limit': 18, 'categories': list, 'tags': list, }, diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py index 35f4b91dd..ec1b97631 100644 --- a/yt_dlp/extractor/radiofrance.py +++ b/yt_dlp/extractor/radiofrance.py @@ -82,7 +82,7 @@ class RadioFranceBaseIE(InfoExtractor): def _extract_data_from_webpage(self, webpage, display_id, key): return traverse_obj(self._search_json( r'\bconst\s+data\s*=', webpage, key, display_id, - contains_pattern=r'(\[\{.*?\}\]);', transform_source=js_to_json), + contains_pattern=r'\[\{(?s:.+)\}\]', transform_source=js_to_json), (..., 'data', key, {dict}), get_all=False) or {} diff --git a/yt_dlp/extractor/rcs.py b/yt_dlp/extractor/rcs.py index 028d3d90b..b865f63fb 100644 --- a/yt_dlp/extractor/rcs.py +++ b/yt_dlp/extractor/rcs.py @@ -239,10 +239,10 @@ class RCSEmbedsIE(RCSBaseIE): } }, { 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789', - 'match_only': True + 'only_matching': True }, { 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140', - 'match_only': True + 'only_matching': True }] _WEBPAGE_TESTS = [{ 'url': 'https://www.iodonna.it/video-iodonna/personaggi-video/monica-bellucci-piu-del-lavoro-oggi-per-me-sono-importanti-lamicizia-e-la-famiglia/', @@ -325,7 +325,7 @@ class RCSIE(RCSBaseIE): } }, { 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945', - 'match_only': True + 'only_matching': True }] diff --git a/yt_dlp/extractor/rokfin.py b/yt_dlp/extractor/rokfin.py index 4a4d40bef..cad76f0c9 100644 --- a/yt_dlp/extractor/rokfin.py +++ b/yt_dlp/extractor/rokfin.py @@ -40,7 +40,6 @@ class RokfinIE(InfoExtractor): 'channel': 'Jimmy Dore', 'channel_id': 65429, 'channel_url': 'https://rokfin.com/TheJimmyDoreShow', - 'duration': 213.0, 'availability': 'public', 'live_status': 'not_live', 'dislike_count': int, diff --git a/yt_dlp/extractor/s4c.py b/yt_dlp/extractor/s4c.py index 990ea2b44..67eff723b 100644 --- 
a/yt_dlp/extractor/s4c.py +++ b/yt_dlp/extractor/s4c.py @@ -78,7 +78,6 @@ class S4CSeriesIE(InfoExtractor): 'info_dict': { 'id': '864982911', 'title': 'Iaith ar Daith', - 'description': 'md5:e878ebf660dce89bd2ef521d7ce06397' }, }, { 'url': 'https://www.s4c.cymru/clic/series/866852587', @@ -86,7 +85,6 @@ class S4CSeriesIE(InfoExtractor): 'info_dict': { 'id': '866852587', 'title': 'FFIT Cymru', - 'description': 'md5:abcb3c129cb68dbb6cd304fd33b07e96' }, }] diff --git a/yt_dlp/extractor/sovietscloset.py b/yt_dlp/extractor/sovietscloset.py index 453016ccb..493eea2a6 100644 --- a/yt_dlp/extractor/sovietscloset.py +++ b/yt_dlp/extractor/sovietscloset.py @@ -76,7 +76,6 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'title': 'Arma 3 - Zeus Games #5', 'uploader': 'SovietWomble', 'thumbnail': r're:^https?://.*\.b-cdn\.net/c0e5e76f-3a93-40b4-bf01-12343c2eec5d/thumbnail\.jpg$', - 'uploader': 'SovietWomble', 'creator': 'SovietWomble', 'release_timestamp': 1461157200, 'release_date': '20160420', diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 023d8fd8c..a39d17cf1 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -902,7 +902,7 @@ def extract_relative_time(relative_time_text): e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today', '8 yr ago' """ - # XXX: this could be moved to a general function in utils.py + # XXX: this could be moved to a general function in utils/_utils.py # The relative time text strings are roughly the same as what # Javascript's Intl.RelativeTimeFormat function generates. # See: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/RelativeTimeFormat diff --git a/yt_dlp/networking/__init__.py b/yt_dlp/networking/__init__.py index 5e8876484..5b1599a6d 100644 --- a/yt_dlp/networking/__init__.py +++ b/yt_dlp/networking/__init__.py @@ -1,4 +1,4 @@ -# flake8: noqa: 401 +# flake8: noqa: F401 from .common import ( HEADRequest, PUTRequest, diff --git a/yt_dlp/networking/_urllib.py b/yt_dlp/networking/_urllib.py index c327f7744..9e2bf33e4 100644 --- a/yt_dlp/networking/_urllib.py +++ b/yt_dlp/networking/_urllib.py @@ -337,7 +337,7 @@ def handle_sslerror(e: ssl.SSLError): def handle_response_read_exceptions(e): if isinstance(e, http.client.IncompleteRead): - raise IncompleteRead(partial=e.partial, cause=e, expected=e.expected) from e + raise IncompleteRead(partial=len(e.partial), cause=e, expected=e.expected) from e elif isinstance(e, ssl.SSLError): handle_sslerror(e) elif isinstance(e, (OSError, EOFError, http.client.HTTPException, *CONTENT_DECODE_ERRORS)): diff --git a/yt_dlp/networking/exceptions.py b/yt_dlp/networking/exceptions.py index 465b18ba9..f58dc246e 100644 --- a/yt_dlp/networking/exceptions.py +++ b/yt_dlp/networking/exceptions.py @@ -75,10 +75,10 @@ def __repr__(self): class IncompleteRead(TransportError): - def __init__(self, partial, expected=None, **kwargs): + def __init__(self, partial: int, expected: int = None, **kwargs): self.partial = partial self.expected = expected - msg = f'{len(partial)} bytes read' + msg = f'{partial} bytes read' if expected is not None: msg += f', {expected} more expected' From eaee21bf71889d495076037cbe590c8c0b21ef3a Mon Sep 17 00:00:00 2001 From: garret Date: Sat, 23 Sep 2023 23:13:48 +0100 Subject: [PATCH 187/218] [ie/Monstercat] Add extractor (#8133) Closes #8067 Authored by: garret1317 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/monstercat.py | 79 +++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 
100644 yt_dlp/extractor/monstercat.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9cda06d8f..691cac339 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1126,6 +1126,7 @@ MofosexEmbedIE, ) from .mojvideo import MojvideoIE +from .monstercat import MonstercatIE from .morningstar import MorningstarIE from .motherless import ( MotherlessIE, diff --git a/yt_dlp/extractor/monstercat.py b/yt_dlp/extractor/monstercat.py new file mode 100644 index 000000000..7f04825fc --- /dev/null +++ b/yt_dlp/extractor/monstercat.py @@ -0,0 +1,79 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_class, + get_element_text_and_html_by_tag, + int_or_none, + unified_strdate, + strip_or_none, + traverse_obj, + try_call, +) + + +class MonstercatIE(InfoExtractor): + _VALID_URL = r'https://www\.monstercat\.com/release/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.monstercat.com/release/742779548009', + 'playlist_count': 20, + 'info_dict': { + 'title': 'The Secret Language of Trees', + 'id': '742779548009', + 'thumbnail': 'https://www.monstercat.com/release/742779548009/cover', + 'release_year': 2023, + 'release_date': '20230711', + 'album': 'The Secret Language of Trees', + 'album_artist': 'BT', + } + }] + + def _extract_tracks(self, table, album_meta): + for td in re.findall(r'<tr[^<]*>((?:(?!</tr>)[\w\W])+)', table): # regex by chatgpt due to lack of get_elements_by_tag + title = clean_html(try_call( + lambda: get_element_by_class('d-inline-flex flex-column', td).partition(' Date: Sun, 24 Sep 2023 06:15:01 +0800 Subject: [PATCH 188/218] [ie/PIAULIZAPortal] Add extractor (#7903) Authored by: pzhlkj6612 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/piaulizaportal.py | 70 ++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 yt_dlp/extractor/piaulizaportal.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 691cac339..49c35cf71 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1452,6 +1452,7 @@ from .phoenix import PhoenixIE from .photobucket import PhotobucketIE from .piapro import PiaproIE +from .piaulizaportal import PIAULIZAPortalIE from .picarto import ( PicartoIE, PicartoVodIE, diff --git a/yt_dlp/extractor/piaulizaportal.py b/yt_dlp/extractor/piaulizaportal.py new file mode 100644 index 000000000..1eb6d92b7 --- /dev/null +++ b/yt_dlp/extractor/piaulizaportal.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + parse_qs, + time_seconds, + traverse_obj, +) + + +class PIAULIZAPortalIE(InfoExtractor): + IE_DESC = 'ulizaportal.jp - PIA LIVE STREAM' + _VALID_URL = r'https?://(?:www\.)?ulizaportal\.jp/pages/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ + 'url': 'https://ulizaportal.jp/pages/005f18b7-e810-5618-cb82-0987c5755d44', + 'info_dict': { + 'id': '005f18b7-e810-5618-cb82-0987c5755d44', + 'title': 'プレゼンテーションプレイヤーのサンプル', + 'live_status': 'not_live', + }, + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + }, { + 'url': 'https://ulizaportal.jp/pages/005e1b23-fe93-5780-19a0-98e917cc4b7d?expires=4102412400&signature=f422a993b683e1068f946caf406d211c17d1ef17da8bef3df4a519502155aa91&version=1', + 'info_dict': { + 'id': '005e1b23-fe93-5780-19a0-98e917cc4b7d', + 'title': '【確認用】視聴サンプルページ(ULIZA)', + 'live_status': 'not_live', + }, + 'params': {
'skip_download': True, + 'ignore_no_formats_error': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + expires = int_or_none(traverse_obj(parse_qs(url), ('expires', 0))) + if expires and expires <= time_seconds(): + raise ExtractorError('The link is expired.', video_id=video_id, expected=True) + + webpage = self._download_webpage(url, video_id) + + player_data = self._download_webpage( + self._search_regex( + r'' _ANVATO_PREFIX = 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' + _CLIENT_DATA = { + 'clientKey': '4cFUW6DmwJpzT9L7LrG3qRAcABG5s04g', + 'clientSecret': 'CZuvCL49d9OwfGsR', + 'deviceId': str(uuid.uuid4()), + 'deviceInfo': base64.b64encode(json.dumps({ + 'model': 'desktop', + 'version': 'Chrome', + 'osName': 'Windows', + 'osVersion': '10.0', + }, separators=(',', ':')).encode()).decode(), + 'networkType': 'other', + 'nflClaimGroupsToAdd': [], + 'nflClaimGroupsToRemove': [], + } + _ACCOUNT_INFO = {} + _API_KEY = None + + _TOKEN = None + _TOKEN_EXPIRY = 0 + + def _get_account_info(self, url, slug): + if not self._API_KEY: + webpage = self._download_webpage(url, slug, fatal=False) or '' + self._API_KEY = self._search_regex( + r'window\.gigyaApiKey\s*=\s*["\'](\w+)["\'];', webpage, 'API key', + fatal=False) or '3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f' + + cookies = self._get_cookies('https://auth-id.nfl.com/') + login_token = traverse_obj(cookies, ( + (f'glt_{self._API_KEY}', lambda k, _: k.startswith('glt_')), {lambda x: x.value}), get_all=False) + if not login_token: + self.raise_login_required() + if 'ucid' not in cookies: + raise ExtractorError( + 'Required cookies for the auth-id.nfl.com domain were not found among passed cookies. ' + 'If using --cookies, these cookies must be exported along with .nfl.com cookies, ' + 'or else try using --cookies-from-browser instead', expected=True) + + account = self._download_json( + 'https://auth-id.nfl.com/accounts.getAccountInfo', slug, + note='Downloading account info', data=urlencode_postdata({ + 'include': 'profile,data', + 'lang': 'en', + 'APIKey': self._API_KEY, + 'sdk': 'js_latest', + 'login_token': login_token, + 'authMode': 'cookie', + 'pageURL': url, + 'sdkBuild': traverse_obj(cookies, ( + 'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='15170'), + 'format': 'json', + }), headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + self._ACCOUNT_INFO = traverse_obj(account, { + 'signatureTimestamp': 'signatureTimestamp', + 'uid': 'UID', + 'uidSignature': 'UIDSignature', + }) + + if len(self._ACCOUNT_INFO) != 3: + raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True) + + def _get_auth_token(self, url, slug): + if self._TOKEN and self._TOKEN_EXPIRY > int(time.time() + 30): + return + + if not self._ACCOUNT_INFO: + self._get_account_info(url, slug) + + token = self._download_json( + 'https://api.nfl.com/identity/v3/token%s' % ( + '/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''), + slug, headers={'Content-Type': 'application/json'}, note='Downloading access token', + data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode()) + + self._TOKEN = token['accessToken'] + self._TOKEN_EXPIRY = token['expiresIn'] + self._ACCOUNT_INFO['refreshToken'] = token['refreshToken'] + def _parse_video_config(self, video_config, display_id): video_config = self._parse_json(video_config, display_id) item = video_config['playlist'][0] @@ -168,7 +247,7 @@ def _real_extract(self, url): class 
NFLPlusReplayIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:replay' - _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/games/[\w-]+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/games/(?P<slug>[\w-]+)(?:/(?P<id>\d+))?' _TESTS = [{ 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1/1572108', 'info_dict': { @@ -185,23 +264,92 @@ class NFLPlusReplayIE(NFLBaseIE): 'thumbnail': r're:^https?://.*\.jpg', }, 'params': {'skip_download': 'm3u8'}, + }, { + 'note': 'Subscription required', + 'url': 'https://www.nfl.com/plus/games/giants-at-vikings-2022-post-1', + 'playlist_count': 4, + 'info_dict': { + 'id': 'giants-at-vikings-2022-post-1', + }, + }, { + 'note': 'Subscription required', + 'url': 'https://www.nfl.com/plus/games/giants-at-patriots-2011-pre-4', + 'playlist_count': 2, + 'info_dict': { + 'id': 'giants-at-patriots-2011-pre-4', + }, + }, { + 'note': 'Subscription required', + 'url': 'https://www.nfl.com/plus/games/giants-at-patriots-2011-pre-4', + 'info_dict': { + 'id': '950701', + 'ext': 'mp4', + 'title': 'Giants @ Patriots', + 'description': 'Giants at Patriots on September 01, 2011', + 'uploader': 'NFL', + 'upload_date': '20210724', + 'timestamp': 1627085874, + 'duration': 1532, + 'categories': ['Game Highlights'], + 'tags': ['play-by-play'], + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'params': { + 'skip_download': 'm3u8', + 'extractor_args': {'nflplusreplay': {'type': ['condensed_game']}}, + }, }] + _REPLAY_TYPES = { + 'full_game': 'Full Game', + 'full_game_spanish': 'Full Game - Spanish', + 'condensed_game': 'Condensed Game', + 'all_22': 'All-22', + } + def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + slug, video_id = self._match_valid_url(url).group('slug', 'id') + requested_types = self._configuration_arg('type', ['all']) + if 'all' in requested_types: + requested_types = list(self._REPLAY_TYPES.keys()) + requested_types = traverse_obj(self._REPLAY_TYPES, (None, requested_types)) + + if not video_id: + self._get_auth_token(url, slug) + headers = {'Authorization': f'Bearer {self._TOKEN}'} + game_id = self._download_json( + f'https://api.nfl.com/football/v2/games/externalId/slug/{slug}', slug, + 'Downloading game ID', query={'withExternalIds': 'true'}, headers=headers)['id'] + replays = self._download_json( + 'https://api.nfl.com/content/v1/videos/replays', slug, 'Downloading replays JSON', + query={'gameId': game_id}, headers=headers) + if len(requested_types) == 1: + video_id = traverse_obj(replays, ( + 'items', lambda _, v: v['subType'] == requested_types[0], 'mcpPlaybackId'), get_all=False) + + if video_id: + return self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + + def entries(): + for replay in traverse_obj( + replays, ('items', lambda _, v: v['mcpPlaybackId'] and v['subType'] in requested_types) + ): + video_id = replay['mcpPlaybackId'] + yield self.url_result(f'{self._ANVATO_PREFIX}{video_id}', AnvatoIE, video_id) + + return self.playlist_result(entries(), slug) class NFLPlusEpisodeIE(NFLBaseIE): IE_NAME = 'nfl.com:plus:episode' _VALID_URL = r'https?://(?:www\.)?nfl.com/plus/episodes/(?P<slug>[\w-]+)' _TESTS = [{ - 'note': 'premium content', + 'note': 'Subscription required', 'url': 'https://www.nfl.com/plus/episodes/kurt-s-qb-insider-conference-championships', 'info_dict': { 'id': '1576832', 'ext': 'mp4', - 'title': 'Kurt\'s QB Insider: Conference Championships', + 'title': 'Conference Championships', 'description':
'md5:944f7fab56f7a37430bf8473f5473857', 'uploader': 'NFL', 'upload_date': '20230127', @@ -214,85 +362,9 @@ class NFLPlusEpisodeIE(NFLBaseIE): 'params': {'skip_download': 'm3u8'}, }] - _CLIENT_DATA = { - 'clientKey': '4cFUW6DmwJpzT9L7LrG3qRAcABG5s04g', - 'clientSecret': 'CZuvCL49d9OwfGsR', - 'deviceId': str(uuid.uuid4()), - 'deviceInfo': base64.b64encode(json.dumps({ - 'model': 'desktop', - 'version': 'Chrome', - 'osName': 'Windows', - 'osVersion': '10.0', - }, separators=(',', ':')).encode()).decode(), - 'networkType': 'other', - 'nflClaimGroupsToAdd': [], - 'nflClaimGroupsToRemove': [], - } - _ACCOUNT_INFO = {} - _API_KEY = None - - _TOKEN = None - _TOKEN_EXPIRY = 0 - - def _get_account_info(self, url, video_id): - cookies = self._get_cookies('https://www.nfl.com/') - login_token = traverse_obj(cookies, ( - (f'glt_{self._API_KEY}', f'gig_loginToken_{self._API_KEY}', - lambda k, _: k.startswith('glt_') or k.startswith('gig_loginToken_')), - {lambda x: x.value}), get_all=False) - if not login_token: - self.raise_login_required() - - account = self._download_json( - 'https://auth-id.nfl.com/accounts.getAccountInfo', video_id, - note='Downloading account info', data=urlencode_postdata({ - 'include': 'profile,data', - 'lang': 'en', - 'APIKey': self._API_KEY, - 'sdk': 'js_latest', - 'login_token': login_token, - 'authMode': 'cookie', - 'pageURL': url, - 'sdkBuild': traverse_obj(cookies, ( - 'gig_canary_ver', {lambda x: x.value.partition('-')[0]}), default='13642'), - 'format': 'json', - }), headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - self._ACCOUNT_INFO = traverse_obj(account, { - 'signatureTimestamp': 'signatureTimestamp', - 'uid': 'UID', - 'uidSignature': 'UIDSignature', - }) - - if len(self._ACCOUNT_INFO) != 3: - raise ExtractorError('Failed to retrieve account info with provided cookies', expected=True) - - def _get_auth_token(self, url, video_id): - if not self._ACCOUNT_INFO: - self._get_account_info(url, video_id) - - token = self._download_json( - 'https://api.nfl.com/identity/v3/token%s' % ( - '/refresh' if self._ACCOUNT_INFO.get('refreshToken') else ''), - video_id, headers={'Content-Type': 'application/json'}, note='Downloading access token', - data=json.dumps({**self._CLIENT_DATA, **self._ACCOUNT_INFO}, separators=(',', ':')).encode()) - - self._TOKEN = token['accessToken'] - self._TOKEN_EXPIRY = token['expiresIn'] - self._ACCOUNT_INFO['refreshToken'] = token['refreshToken'] - def _real_extract(self, url): slug = self._match_id(url) - - if not self._API_KEY: - webpage = self._download_webpage(url, slug, fatal=False) or '' - self._API_KEY = self._search_regex( - r'window\.gigyaApiKey=["\'](\w+)["\'];', webpage, 'API key', - default='3_Qa8TkWpIB8ESCBT8tY2TukbVKgO5F6BJVc7N1oComdwFzI7H2L9NOWdm11i_BY9f') - - if not self._TOKEN or self._TOKEN_EXPIRY <= int(time.time()): - self._get_auth_token(url, slug) - + self._get_auth_token(url, slug) video_id = self._download_json( f'https://api.nfl.com/content/v1/videos/episodes/{slug}', slug, headers={ 'Authorization': f'Bearer {self._TOKEN}', From 61bdf15fc7400601c3da1aa7a43917310a5bf391 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 24 Sep 2023 02:24:47 +0200 Subject: [PATCH 193/218] [core] Raise minimum recommended Python version to 3.8 (#8183) Authored by: Grub4K --- devscripts/changelog_override.json | 5 +++++ test/test_execution.py | 3 +++ yt_dlp/YoutubeDL.py | 16 ++++------------ yt_dlp/update.py | 25 +++++++++++++++++++++++++ 4 files changed, 37 insertions(+), 12 deletions(-) diff --git 
a/devscripts/changelog_override.json b/devscripts/changelog_override.json index e7f453acf..9dfbf510f 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -88,5 +88,10 @@ "when": "59e92b1f1833440bb2190f847eb735cf0f90bc85", "short": "[rh:urllib] Simplify gzip decoding (#7611)", "authors": ["Grub4K"] + }, + { + "action": "add", + "when": "c1d71d0d9f41db5e4306c86af232f5f6220a130b", + "short": "[priority] **The minimum *recommended* Python version has been raised to 3.8**\nSince Python 3.7 has reached end-of-life, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/7803)" } ] diff --git a/test/test_execution.py b/test/test_execution.py index 7a9e800b6..fb2f6e2e9 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -45,6 +45,9 @@ def test_lazy_extractors(self): self.assertTrue(os.path.exists(LAZY_EXTRACTORS)) _, stderr = self.run_yt_dlp(opts=('-s', 'test:')) + # `MIN_RECOMMENDED` emits a deprecated feature warning for deprecated python versions + if stderr and stderr.startswith('Deprecated Feature: Support for Python'): + stderr = '' self.assertFalse(stderr) subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=subprocess.DEVNULL) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 39aaf2c2e..f322b12a2 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -60,7 +60,7 @@ get_postprocessor, ) from .postprocessor.ffmpeg import resolve_mapping as resolve_recode_mapping -from .update import REPOSITORY, current_git_head, detect_variant +from .update import REPOSITORY, _get_system_deprecation, current_git_head, detect_variant from .utils import ( DEFAULT_OUTTMPL, IDENTITY, @@ -640,17 +640,9 @@ def process_color_policy(stream): for name, stream in self._out_files.items_ if name != 'console' }) - # The code is left like this to be reused for future deprecations - MIN_SUPPORTED, MIN_RECOMMENDED = (3, 7), (3, 7) - current_version = sys.version_info[:2] - if current_version < MIN_RECOMMENDED: - msg = ('Support for Python version %d.%d has been deprecated. ' - 'See https://github.com/yt-dlp/yt-dlp/issues/3764 for more details.' - '\n You will no longer receive updates on this version') - if current_version < MIN_SUPPORTED: - msg = 'Python version %d.%d is no longer supported' - self.deprecated_feature( - f'{msg}! Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED)) + system_deprecation = _get_system_deprecation() + if system_deprecation: + self.deprecated_feature(system_deprecation.replace('\n', '\n ')) if self.params.get('allow_unplayable_formats'): self.report_warning( diff --git a/yt_dlp/update.py b/yt_dlp/update.py index d708b09e3..db79df127 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -112,6 +112,31 @@ def is_non_updateable(): detect_variant(), _NON_UPDATEABLE_REASONS['unknown' if VARIANT else 'other']) +def _get_system_deprecation(): + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 7), (3, 8) + + if sys.version_info > MIN_RECOMMENDED: + return None + + major, minor = sys.version_info[:2] + if sys.version_info < MIN_SUPPORTED: + msg = f'Python version {major}.{minor} is no longer supported' + else: + msg = f'Support for Python version {major}.{minor} has been deprecated. 
' + # Temporary until `win_x86_exe` uses 3.8, which will deprecate Vista and Server 2008 + if detect_variant() == 'win_x86_exe': + platform_name = platform.platform() + if any(platform_name.startswith(f'Windows-{name}') for name in ('Vista', '2008Server')): + msg = 'Support for Windows Vista/Server 2008 has been deprecated. ' + else: + return None + msg += ('See https://github.com/yt-dlp/yt-dlp/issues/7803 for details.' + '\nYou may stop receiving updates on this version at any time') + + major, minor = MIN_RECOMMENDED + return f'{msg}! Please update to Python {major}.{minor} or above' + + def _sha256_file(path): h = hashlib.sha256() mv = memoryview(bytearray(128 * 1024)) From de015e930747165dbb8fcd360f8775fd973b7d6e Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 24 Sep 2023 02:29:01 +0200 Subject: [PATCH 194/218] [core] Prevent RCE when using `--exec` with `%q` (CVE-2023-40581) The shell escape function is now using `""` instead of `\"`. `utils.Popen` has been patched to properly quote commands. Prior to this fix using `--exec` together with `%q` when on Windows could cause remote code to execute. See https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-42h4-v29r-42qg for reference. Authored by: Grub4K --- devscripts/changelog_override.json | 5 +++++ test/test_YoutubeDL.py | 6 +++--- test/test_utils.py | 16 ++++++++++++++++ yt_dlp/compat/__init__.py | 2 +- yt_dlp/postprocessor/exec.py | 12 +++++------- yt_dlp/utils/_utils.py | 18 ++++++++++++++++-- 6 files changed, 46 insertions(+), 13 deletions(-) diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 9dfbf510f..fe0c82c66 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -93,5 +93,10 @@ "action": "add", "when": "c1d71d0d9f41db5e4306c86af232f5f6220a130b", "short": "[priority] **The minimum *recommended* Python version has been raised to 3.8**\nSince Python 3.7 has reached end-of-life, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/7803)" + }, + { + "action": "add", + "when": "61bdf15fc7400601c3da1aa7a43917310a5bf391", + "short": "[priority] Security: [[CVE-2023-40581](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-40581)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-42h4-v29r-42qg)\n - The shell escape function is now using `\"\"` instead of `\\\"`.\n - `utils.Popen` has been patched to properly quote commands." 
} ] diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 916ee48b9..0cf130db0 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -784,9 +784,9 @@ def expect_same_infodict(out): test('%(title4)#S', 'foo_bar_test') test('%(title4).10S', ('foo "bar" ', 'foo "bar"' + ('#' if compat_os_name == 'nt' else ' '))) if compat_os_name == 'nt': - test('%(title4)q', ('"foo \\"bar\\" test"', "\"foo \\\"bar\\\" test\"")) - test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', '"id 1" "id 2" "id 3"')) - test('%(formats.0.id)#q', ('"id 1"', '"id 1"')) + test('%(title4)q', ('"foo ""bar"" test"', None)) + test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', None)) + test('%(formats.0.id)#q', ('"id 1"', None)) else: test('%(title4)q', ('\'foo "bar" test\'', '\'foo "bar" test\'')) test('%(formats.:.id)#q', "'id 1' 'id 2' 'id 3'") diff --git a/test/test_utils.py b/test/test_utils.py index 47d1f71bf..dc2d8ce12 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -14,6 +14,7 @@ import io import itertools import json +import subprocess import xml.etree.ElementTree from yt_dlp.compat import ( @@ -28,6 +29,7 @@ InAdvancePagedList, LazyList, OnDemandPagedList, + Popen, age_restricted, args_to_str, base_url, @@ -2388,6 +2390,20 @@ def test_extract_basic_auth(self): assert extract_basic_auth('http://user:@foo.bar') == ('http://foo.bar', 'Basic dXNlcjo=') assert extract_basic_auth('http://user:pass@foo.bar') == ('http://foo.bar', 'Basic dXNlcjpwYXNz') + @unittest.skipUnless(compat_os_name == 'nt', 'Only relevant on Windows') + def test_Popen_windows_escaping(self): + def run_shell(args): + stdout, stderr, error = Popen.run( + args, text=True, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + assert not stderr + assert not error + return stdout + + # Test escaping + assert run_shell(['echo', 'test"&']) == '"test""&"\n' + # Test if delayed expansion is disabled + assert run_shell(['echo', '^!']) == '"^!"\n' + assert run_shell('echo "^!"') == '"^!"\n' if __name__ == '__main__': unittest.main() diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py index 832a9138d..5ad5c70ec 100644 --- a/yt_dlp/compat/__init__.py +++ b/yt_dlp/compat/__init__.py @@ -30,7 +30,7 @@ def compat_etree_fromstring(text): if compat_os_name == 'nt': def compat_shlex_quote(s): import re - return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') + return s if re.match(r'^[-_\w./]+$', s) else s.replace('"', '""').join('""') else: from shlex import quote as compat_shlex_quote # noqa: F401 diff --git a/yt_dlp/postprocessor/exec.py b/yt_dlp/postprocessor/exec.py index cfc83167c..c2e73fbab 100644 --- a/yt_dlp/postprocessor/exec.py +++ b/yt_dlp/postprocessor/exec.py @@ -1,8 +1,6 @@ -import subprocess - from .common import PostProcessor from ..compat import compat_shlex_quote -from ..utils import PostProcessingError, encodeArgument, variadic +from ..utils import Popen, PostProcessingError, variadic class ExecPP(PostProcessor): @@ -27,10 +25,10 @@ def parse_cmd(self, cmd, info): def run(self, info): for tmpl in self.exec_cmd: cmd = self.parse_cmd(tmpl, info) - self.to_screen('Executing command: %s' % cmd) - retCode = subprocess.call(encodeArgument(cmd), shell=True) - if retCode != 0: - raise PostProcessingError('Command returned error code %d' % retCode) + self.to_screen(f'Executing command: {cmd}') + _, _, return_code = Popen.run(cmd, shell=True) + if return_code != 0: + raise PostProcessingError(f'Command returned error code {return_code}') return [], info diff --git
a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 213ccc636..ba6242380 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -825,7 +825,7 @@ def _fix(key): _fix('LD_LIBRARY_PATH') # Linux _fix('DYLD_LIBRARY_PATH') # macOS - def __init__(self, *args, env=None, text=False, **kwargs): + def __init__(self, args, *remaining, env=None, text=False, shell=False, **kwargs): if env is None: env = os.environ.copy() self._fix_pyinstaller_ld_path(env) @@ -835,7 +835,21 @@ def __init__(self, *args, env=None, text=False, **kwargs): kwargs['universal_newlines'] = True # For 3.6 compatibility kwargs.setdefault('encoding', 'utf-8') kwargs.setdefault('errors', 'replace') - super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo) + + if shell and compat_os_name == 'nt' and kwargs.get('executable') is None: + if not isinstance(args, str): + args = ' '.join(compat_shlex_quote(a) for a in args) + shell = False + args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"' + + super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo) + + def __comspec(self): + comspec = os.environ.get('ComSpec') or os.path.join( + os.environ.get('SystemRoot', ''), 'System32', 'cmd.exe') + if os.path.isabs(comspec): + return comspec + raise FileNotFoundError('shell not found: neither %ComSpec% nor %SystemRoot% is set') def communicate_or_kill(self, *args, **kwargs): try: From 088add9567d39b758737e4299a0e619fd89d2e8f Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Sun, 24 Sep 2023 02:35:23 +0200 Subject: [PATCH 195/218] [cleanup] Misc Authored by: Grub4K --- test/test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_utils.py b/test/test_utils.py index dc2d8ce12..fd612ff86 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2405,5 +2405,6 @@ def run_shell(args): assert run_shell(['echo', '^!']) == '"^!"\n' assert run_shell('echo "^!"') == '"^!"\n' + if __name__ == '__main__': unittest.main() From c54ddfba0f7d68034339426223d75373c5fc86df Mon Sep 17 00:00:00 2001 From: github-actions Date: Sun, 24 Sep 2023 00:38:42 +0000 Subject: [PATCH 196/218] Release 2023.09.24 Created by: Grub4K :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 +- .../ISSUE_TEMPLATE/2_site_support_request.yml | 8 +- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 8 +- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 +- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 +- .github/ISSUE_TEMPLATE/6_question.yml | 8 +- CONTRIBUTORS | 36 ++++ Changelog.md | 196 ++++++++++++++++++ supportedsites.md | 49 ++++- yt_dlp/version.py | 4 +- 10 files changed, 298 insertions(+), 35 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index dd1b33dde..f0fc71d57 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -64,7 +64,7 @@ body: [debug] Command-line config: ['-vU', 
'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -72,8 +72,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index 4f4378924..ac9a72a1c 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -76,7 +76,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -84,8 +84,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 05b4dd23b..577e4d491 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -72,7 +72,7 @@ body: [debug] 
Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -80,8 +80,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 880f1014c..9529c1bd6 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,8 +65,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index acb11795f..b17a6e046 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the 
[bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. DO NOT post duplicates required: true @@ -53,7 +53,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -61,7 +61,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index a2563e975..5345e8917 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.07.06** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -59,7 +59,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.07.06 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -67,7 +67,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.07.06, Current version: 2023.07.06 - yt-dlp is up to date (2023.07.06) + Latest version: 2023.09.24, Current version: 2023.09.24 + yt-dlp is up to date (2023.09.24) render: shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 6b9b9f470..72b9584ec 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -467,3 +467,39 @@ rdamas RfadnjdExt urectanc nao20010128nao/Lesmiscore +04-pasha-04 +aaruni96 +aky-01 +AmirAflak +ApoorvShah111 +at-wat +davinkevin +demon071 +denhotte +FinnRG +fireattack +Frankgoji +GD-Slime +hatsomatt +ifan-t +kshitiz305 +kylegustavo +mabdelfattah +nathantouze +niemands +Rajeshwaran2001 +RedDeffender +Rohxn16 +sb0stn +SevenLives +simon300000 +snixon +soundchaser128 +szabyg +trainman261 +trislee +wader +Yalab7 +zhallgato +zhong-yiyu +Zprokkel diff --git a/Changelog.md b/Changelog.md index 32cdaca2a..04511927f 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,202 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.09.24 + +#### Important changes +- **The minimum *recommended* Python version has been raised to 3.8** +Since Python 3.7 has reached end-of-life, support for it will be dropped soon. [Read more](https://github.com/yt-dlp/yt-dlp/issues/7803) +- Security: [[CVE-2023-40581](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2023-40581)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-42h4-v29r-42qg) + - The shell escape function is now using `""` instead of `\"`. + - `utils.Popen` has been patched to properly quote commands. 
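A minimal sketch of the quoting change described above, assuming Windows `cmd.exe` quoting rules; the helper names and the payload are illustrative only, with `quote_nt` mirroring the patched `compat_shlex_quote` and `old_quote_nt` the previous implementation:

    import re

    def old_quote_nt(s):  # pre-patch: backslash-escapes inner double quotes
        return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"')

    def quote_nt(s):  # patched: doubles inner quotes, then wraps the value in quotes
        return s if re.match(r'^[-_\w./]+$', s) else s.replace('"', '""').join('""')

    payload = 'test" & calc.exe & "'  # hypothetical attacker-controlled metadata in a %q field
    print(old_quote_nt(payload))  # "test\" & calc.exe & \"" -- cmd.exe ends the quoted region
                                  # at \", leaving the & unquoted, so calc.exe would run (the RCE)
    print(quote_nt(payload))      # "test"" & calc.exe & """ -- the doubled quote keeps the parser
                                  # inside the string, so the & is passed through as data

The patched `Popen` additionally runs shell commands through `cmd.exe /Q /S /D /V:OFF /C`, disabling delayed `!` expansion so quoted arguments cannot be expanded a second time.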
+ +#### Core changes +- [Fix HTTP headers and cookie handling](https://github.com/yt-dlp/yt-dlp/commit/6c5211cebeacfc53ad5d5ddf4a659be76039656f) by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan) +- [Fix `--check-formats`](https://github.com/yt-dlp/yt-dlp/commit/8cb7fc44db010e965d808ee679ef0725cb6e147c) by [pukkandan](https://github.com/pukkandan) +- [Fix support for upcoming Python 3.12](https://github.com/yt-dlp/yt-dlp/commit/836e06d246512f286f30c1371b2c54b72c9ecd93) ([#8130](https://github.com/yt-dlp/yt-dlp/issues/8130)) by [Grub4K](https://github.com/Grub4K) +- [Merged with youtube-dl 66ab08](https://github.com/yt-dlp/yt-dlp/commit/9d6254069c75877bc88bc3584f4326fb1853a543) by [coletdjnz](https://github.com/coletdjnz) +- [Prevent RCE when using `--exec` with `%q` (CVE-2023-40581)](https://github.com/yt-dlp/yt-dlp/commit/de015e930747165dbb8fcd360f8775fd973b7d6e) by [Grub4K](https://github.com/Grub4K) +- [Raise minimum recommended Python version to 3.8](https://github.com/yt-dlp/yt-dlp/commit/61bdf15fc7400601c3da1aa7a43917310a5bf391) ([#8183](https://github.com/yt-dlp/yt-dlp/issues/8183)) by [Grub4K](https://github.com/Grub4K) +- [`FFmpegFixupM3u8PP` may need to run with ffmpeg](https://github.com/yt-dlp/yt-dlp/commit/f73c11803579889dc8e1c99e25dba9a22fef39d8) by [pukkandan](https://github.com/pukkandan) +- **compat** + - [Add `types.NoneType`](https://github.com/yt-dlp/yt-dlp/commit/e0c4db04dc82a699bdabd9821ddc239ebe17d30a) by [pukkandan](https://github.com/pukkandan) (With fixes in [25b6e8f](https://github.com/yt-dlp/yt-dlp/commit/25b6e8f94679b4458550702b46e61249b875a4fd)) + - [Deprecate old functions](https://github.com/yt-dlp/yt-dlp/commit/3d2623a898196640f7cc0fc8b70118ff19e6925d) ([#2861](https://github.com/yt-dlp/yt-dlp/issues/2861)) by [coletdjnz](https://github.com/coletdjnz), [pukkandan](https://github.com/pukkandan) + - [Ensure submodules are imported correctly](https://github.com/yt-dlp/yt-dlp/commit/a250b247334ce9f641e709cbb64974da6034a2b3) by [pukkandan](https://github.com/pukkandan) +- **cookies**: [Containers JSON should be opened as utf-8](https://github.com/yt-dlp/yt-dlp/commit/dab87ca23650fd87184ff5286b53e6985b59f71d) ([#7800](https://github.com/yt-dlp/yt-dlp/issues/7800)) by [bashonly](https://github.com/bashonly) +- **dependencies**: [Handle deprecation of `sqlite3.version`](https://github.com/yt-dlp/yt-dlp/commit/35f9a306e6934793cff100200cd03f288ec33f11) ([#8167](https://github.com/yt-dlp/yt-dlp/issues/8167)) by [bashonly](https://github.com/bashonly) +- **outtmpl**: [Fix replacement for `playlist_index`](https://github.com/yt-dlp/yt-dlp/commit/a264433c9fba147ecae2420091614186cfeeb895) by [pukkandan](https://github.com/pukkandan) +- **utils** + - [Add temporary shim for logging](https://github.com/yt-dlp/yt-dlp/commit/1b392f905d20ef1f1b300b180f867d43c9ce49b8) by [pukkandan](https://github.com/pukkandan) + - [Improve `parse_duration`](https://github.com/yt-dlp/yt-dlp/commit/af86873218c24c3859ccf575a87f2b00a73b49d0) by [bashonly](https://github.com/bashonly) + - HTTPHeaderDict: [Handle byte values](https://github.com/yt-dlp/yt-dlp/commit/3f7965105d8d2048359e67c1e8b8ebd51588143b) by [pukkandan](https://github.com/pukkandan) + - `clean_podcast_url`: [Handle more trackers](https://github.com/yt-dlp/yt-dlp/commit/2af4eeb77246b8183aae75a0a8d19f18c08115b2) ([#7556](https://github.com/yt-dlp/yt-dlp/issues/7556)) by [bashonly](https://github.com/bashonly), [mabdelfattah](https://github.com/mabdelfattah) + - `js_to_json`: [Handle `Array` 
objects](https://github.com/yt-dlp/yt-dlp/commit/52414d64ca7b92d3f83964cdd68247989b0c4625) by [Grub4K](https://github.com/Grub4K), [std-move](https://github.com/std-move) + +#### Extractor changes +- [Extract subtitles from SMIL manifests](https://github.com/yt-dlp/yt-dlp/commit/550e65410a7a1b105923494ac44460a4dc1a15d9) ([#7667](https://github.com/yt-dlp/yt-dlp/issues/7667)) by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan) +- [Fix `--load-pages`](https://github.com/yt-dlp/yt-dlp/commit/81b4712bca608b9015aa68a4d96661d56e9cb894) by [pukkandan](https://github.com/pukkandan) +- [Make `_search_nuxt_data` more lenient](https://github.com/yt-dlp/yt-dlp/commit/904a19ee93195ce0bd4b08bd22b186120afb5b17) by [std-move](https://github.com/std-move) +- **abematv** + - [Fix proxy handling](https://github.com/yt-dlp/yt-dlp/commit/497bbbbd7328cb705f70eced94dbd90993819a46) ([#8046](https://github.com/yt-dlp/yt-dlp/issues/8046)) by [SevenLives](https://github.com/SevenLives) + - [Temporary fix for protocol handler](https://github.com/yt-dlp/yt-dlp/commit/9f66247289b9f8ecf931833b3f5f127274dd2161) by [pukkandan](https://github.com/pukkandan) +- **amazonminitv**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/538d37671a17e0782d17f08df17800e2e3bd57c8) by [bashonly](https://github.com/bashonly), [GautamMKGarg](https://github.com/GautamMKGarg) +- **antenna**: [Support antenna.gr](https://github.com/yt-dlp/yt-dlp/commit/665876034c8d3c031443f6b4958bed02ccdf4164) ([#7584](https://github.com/yt-dlp/yt-dlp/issues/7584)) by [stdedos](https://github.com/stdedos) +- **artetv**: [Fix HLS formats extraction](https://github.com/yt-dlp/yt-dlp/commit/c2da0b5ea215298135f76e3dc14b972a3c4afacb) by [bashonly](https://github.com/bashonly) +- **axs**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/aee6b9b88c0bcccf27fd23b7e00fc0b7b168928f) ([#8094](https://github.com/yt-dlp/yt-dlp/issues/8094)) by [barsnick](https://github.com/barsnick) +- **banbye**: [Support video ids containing a hyphen](https://github.com/yt-dlp/yt-dlp/commit/578a82e497502b951036ce9da6fe0dac6937ac27) ([#8059](https://github.com/yt-dlp/yt-dlp/issues/8059)) by [kshitiz305](https://github.com/kshitiz305) +- **bbc**: [Extract tracklist as chapters](https://github.com/yt-dlp/yt-dlp/commit/eda0e415d26eb084e570cf5372d38ee1f616b70f) ([#7788](https://github.com/yt-dlp/yt-dlp/issues/7788)) by [garret1317](https://github.com/garret1317) +- **bild.de**: [Extract HLS formats](https://github.com/yt-dlp/yt-dlp/commit/b4c1c408c63724339eb12b16c91b253a7ee62cfa) ([#8032](https://github.com/yt-dlp/yt-dlp/issues/8032)) by [barsnick](https://github.com/barsnick) +- **bilibili** + - [Add support for series, favorites and watch later](https://github.com/yt-dlp/yt-dlp/commit/9e68747f9607f05e92bb7d9b6e79d678b50070e1) ([#7518](https://github.com/yt-dlp/yt-dlp/issues/7518)) by [c-basalt](https://github.com/c-basalt) + - [Extract Dolby audio formats](https://github.com/yt-dlp/yt-dlp/commit/b84fda7388dd20d38921e23b469147f3957c1812) ([#8142](https://github.com/yt-dlp/yt-dlp/issues/8142)) by [ClosedPort22](https://github.com/ClosedPort22) + - [Extract `format_id`](https://github.com/yt-dlp/yt-dlp/commit/5336bf57a7061e0955a37f0542fc8ebf50d55b17) ([#7555](https://github.com/yt-dlp/yt-dlp/issues/7555)) by [c-basalt](https://github.com/c-basalt) +- **bilibilibangumi**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/bdd0b75e3f41ff35440eda6d395008beef19ef2f) ([#7337](https://github.com/yt-dlp/yt-dlp/issues/7337)) by 
[GD-Slime](https://github.com/GD-Slime) +- **bpb**: [Overhaul extractor](https://github.com/yt-dlp/yt-dlp/commit/f659e6439444ac64305b5c80688cd82f59d2279c) ([#8119](https://github.com/yt-dlp/yt-dlp/issues/8119)) by [Grub4K](https://github.com/Grub4K) +- **brilliantpala**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/92feb5654c5a4c81ba872904a618700fcbb3e546) ([#6680](https://github.com/yt-dlp/yt-dlp/issues/6680)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **canal1, caracoltvplay**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/b3febedbeb662dfdf9b5c1d5799039ad4fc969de) ([#7151](https://github.com/yt-dlp/yt-dlp/issues/7151)) by [elyse0](https://github.com/elyse0) +- **cbc**: [Ignore any 426 from API](https://github.com/yt-dlp/yt-dlp/commit/9bf14be775289bd88cc1f5c89fd761ae51879484) ([#7689](https://github.com/yt-dlp/yt-dlp/issues/7689)) by [makew0rld](https://github.com/makew0rld) +- **cbcplayer**: [Extract HLS formats and subtitles](https://github.com/yt-dlp/yt-dlp/commit/339c339fec095ff4141b20e6aa83629117fb26df) ([#7484](https://github.com/yt-dlp/yt-dlp/issues/7484)) by [trainman261](https://github.com/trainman261) +- **cbcplayerplaylist**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ed711897814f3ee0b1822e4205e74133467e8f1c) ([#7870](https://github.com/yt-dlp/yt-dlp/issues/7870)) by [trainman261](https://github.com/trainman261) +- **cineverse**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/15591940ff102d1ae337d603a46d8f238c83a61f) ([#8146](https://github.com/yt-dlp/yt-dlp/issues/8146)) by [garret1317](https://github.com/garret1317) +- **crunchyroll**: [Remove initial state extraction](https://github.com/yt-dlp/yt-dlp/commit/9b16762f48914de9ac914601769c76668e433325) ([#7632](https://github.com/yt-dlp/yt-dlp/issues/7632)) by [Grub4K](https://github.com/Grub4K) +- **douyutv**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/21f40e75dfc0055ea9cdbd7fe2c46c6f9b561afd) ([#7652](https://github.com/yt-dlp/yt-dlp/issues/7652)) by [c-basalt](https://github.com/c-basalt) +- **dropbox**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/b9f2bc2dbed2323734a0d18e65e1e2e23dc833d8) ([#7926](https://github.com/yt-dlp/yt-dlp/issues/7926)) by [bashonly](https://github.com/bashonly), [denhotte](https://github.com/denhotte), [nathantouze](https://github.com/nathantouze) (With fixes in [099fb1b](https://github.com/yt-dlp/yt-dlp/commit/099fb1b35cf835303306549f5113d1802d79c9c7) by [bashonly](https://github.com/bashonly)) +- **eplus**: inbound: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/295fbb3ae3a7d0dd50e286be5c487cf145ed5778) ([#5782](https://github.com/yt-dlp/yt-dlp/issues/5782)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **expressen**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/a5e264d74b4bd60c6e7ec4e38f1a23af4e420531) ([#8153](https://github.com/yt-dlp/yt-dlp/issues/8153)) by [kylegustavo](https://github.com/kylegustavo) +- **facebook** + - [Add dash manifest URL](https://github.com/yt-dlp/yt-dlp/commit/a854fbec56d5004f5147116a41d1dd050632a579) ([#7743](https://github.com/yt-dlp/yt-dlp/issues/7743)) by [ringus1](https://github.com/ringus1) + - [Fix webpage extraction](https://github.com/yt-dlp/yt-dlp/commit/d3d81cc98f554d0adb87d24bfd6fabaaa803944d) ([#7890](https://github.com/yt-dlp/yt-dlp/issues/7890)) by [ringus1](https://github.com/ringus1) + - [Improve format sorting](https://github.com/yt-dlp/yt-dlp/commit/308936619c8a4f3a52d73c829c2006ff6c55fea2) 
([#8074](https://github.com/yt-dlp/yt-dlp/issues/8074)) by [fireattack](https://github.com/fireattack) + - reel: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/bb5d84c9d2f1e978c3eddfb5ccbe138036682a36) ([#7564](https://github.com/yt-dlp/yt-dlp/issues/7564)) by [bashonly](https://github.com/bashonly), [demon071](https://github.com/demon071) +- **fox**: [Support foxsports.com](https://github.com/yt-dlp/yt-dlp/commit/30b29f37159e9226e2f2d5434c9a4096ac4efa2e) ([#7724](https://github.com/yt-dlp/yt-dlp/issues/7724)) by [ischmidt20](https://github.com/ischmidt20) +- **funker530**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/0ce1f48bf1cb78d40d734ce73ee1c90eccf92274) ([#8040](https://github.com/yt-dlp/yt-dlp/issues/8040)) by [04-pasha-04](https://github.com/04-pasha-04) +- **generic** + - [Fix KVS thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/53675852195d8dd859555d4789944a6887171ff8) by [bashonly](https://github.com/bashonly) + - [Fix generic title for embeds](https://github.com/yt-dlp/yt-dlp/commit/994f7ef8e6003f4b7b258528755d0b6adcc31714) by [pukkandan](https://github.com/pukkandan) +- **gofile**: [Update token](https://github.com/yt-dlp/yt-dlp/commit/99c99c7185f5d8e9b3699a6fc7f86ec663d7b97e) by [bashonly](https://github.com/bashonly) +- **hotstar** + - [Extract `release_year`](https://github.com/yt-dlp/yt-dlp/commit/7237c8dca0590aa7438ade93f927df88c9381ec7) ([#7869](https://github.com/yt-dlp/yt-dlp/issues/7869)) by [Rajeshwaran2001](https://github.com/Rajeshwaran2001) + - [Make metadata extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/30ea88591b728cca0896018dbf67c2298070c669) by [bashonly](https://github.com/bashonly) + - [Support `/clips/` URLs](https://github.com/yt-dlp/yt-dlp/commit/86eeb044c2342d68c6ef177577f87852e6badd85) ([#7710](https://github.com/yt-dlp/yt-dlp/issues/7710)) by [bashonly](https://github.com/bashonly) +- **hungama**: [Overhaul extractors](https://github.com/yt-dlp/yt-dlp/commit/4b3a6ef1b3e235ba9a45142830b6edb357c71696) ([#7757](https://github.com/yt-dlp/yt-dlp/issues/7757)) by [bashonly](https://github.com/bashonly), [Yalab7](https://github.com/Yalab7) +- **indavideoembed**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/63e0c5748c0eb461a2ccca4181616eb930b4b750) ([#8129](https://github.com/yt-dlp/yt-dlp/issues/8129)) by [aky-01](https://github.com/aky-01) +- **iprima**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/568f08051841aedea968258889539741e26009e9) ([#7216](https://github.com/yt-dlp/yt-dlp/issues/7216)) by [std-move](https://github.com/std-move) +- **lbry**: [Fix original format extraction](https://github.com/yt-dlp/yt-dlp/commit/127a22460658ac39cbe5c4b3fb88d578363e0dfa) ([#7711](https://github.com/yt-dlp/yt-dlp/issues/7711)) by [bashonly](https://github.com/bashonly) +- **lecturio**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/efa2339502a37cf13ae7f143bd8b2c28f452d1cd) ([#7649](https://github.com/yt-dlp/yt-dlp/issues/7649)) by [simon300000](https://github.com/simon300000) +- **magellantv**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/f4ea501551526ebcb54d19b84cf0ebe798583a85) ([#7616](https://github.com/yt-dlp/yt-dlp/issues/7616)) by [bashonly](https://github.com/bashonly) +- **massengeschmack.tv**: [Fix title extraction](https://github.com/yt-dlp/yt-dlp/commit/81f46ac573dc443ad48560f308582a26784d3015) ([#7813](https://github.com/yt-dlp/yt-dlp/issues/7813)) by [sb0stn](https://github.com/sb0stn) +- **media.ccc.de**: lists: [Fix 
extraction](https://github.com/yt-dlp/yt-dlp/commit/cf11b40ac40e3d23a6352753296f3a732886efb9) ([#8144](https://github.com/yt-dlp/yt-dlp/issues/8144)) by [Rohxn16](https://github.com/Rohxn16) +- **mediaite**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/630a55df8de7747e79aa680959d785dfff2c4b76) ([#7923](https://github.com/yt-dlp/yt-dlp/issues/7923)) by [Grabien](https://github.com/Grabien) +- **mediaklikk**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/6e07e4bc7e59f5bdb60e93c011e57b18b009f2b5) ([#8086](https://github.com/yt-dlp/yt-dlp/issues/8086)) by [bashonly](https://github.com/bashonly), [zhallgato](https://github.com/zhallgato) +- **mediastream**: [Make embed extraction non-fatal](https://github.com/yt-dlp/yt-dlp/commit/635ae31f68a3ac7f6393d59657ed711e34ee3552) by [bashonly](https://github.com/bashonly) +- **mixcloud**: [Update API URL](https://github.com/yt-dlp/yt-dlp/commit/7b71643cc986de9a3768dac4ac9b64f4d05e7f5e) ([#8114](https://github.com/yt-dlp/yt-dlp/issues/8114)) by [garret1317](https://github.com/garret1317) +- **monstercat**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/eaee21bf71889d495076037cbe590c8c0b21ef3a) ([#8133](https://github.com/yt-dlp/yt-dlp/issues/8133)) by [garret1317](https://github.com/garret1317) +- **motortrendondemand**: [Update `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/c03a58ec9933e4a42c2d8fa80b8a0ddb2cde64e6) ([#7683](https://github.com/yt-dlp/yt-dlp/issues/7683)) by [AmirAflak](https://github.com/AmirAflak) +- **museai**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/65cfa2b057d7946fbe322155a778fe206556d0c6) ([#7614](https://github.com/yt-dlp/yt-dlp/issues/7614)) by [bashonly](https://github.com/bashonly) +- **mzaalo**: [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/d7aee8e310b2c4f21d50aac0b420e1b3abde21a4) by [bashonly](https://github.com/bashonly) +- **n1info**: article: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/8ac5b6d96ae5c60cd5ae2495949e0068a6754c45) ([#7373](https://github.com/yt-dlp/yt-dlp/issues/7373)) by [u-spec-png](https://github.com/u-spec-png) +- **nfl.com**: plus, replay: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/1eaca74bc2ca0f5b1ec532f24c61de44f2e8cb2d) ([#7838](https://github.com/yt-dlp/yt-dlp/issues/7838)) by [bashonly](https://github.com/bashonly) +- **niconicochannelplus**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/698beb9a497f51693e64d167e572ff9efa4bc25f) ([#5686](https://github.com/yt-dlp/yt-dlp/issues/5686)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **nitter**: [Fix title extraction fallback](https://github.com/yt-dlp/yt-dlp/commit/a83da3717d30697102e76f63a6f29d77f9373c2a) ([#8102](https://github.com/yt-dlp/yt-dlp/issues/8102)) by [ApoorvShah111](https://github.com/ApoorvShah111) +- **noodlemagazine**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/bae4834245a708fff97219849ec880c319c88bc6) ([#7830](https://github.com/yt-dlp/yt-dlp/issues/7830)) by [RedDeffender](https://github.com/RedDeffender) (With fixes in [69dbfe0](https://github.com/yt-dlp/yt-dlp/commit/69dbfe01c47cd078682a87f179f5846e2679e927) by [bashonly](https://github.com/bashonly)) +- **novaembed**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/2269065ad60cb0ab62408ae6a7b20283e5252232) ([#7910](https://github.com/yt-dlp/yt-dlp/issues/7910)) by [std-move](https://github.com/std-move) +- **patreoncampaign**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/11de6fec9c9b8d34d1f90c8e6218ec58a3471b58) 
([#7664](https://github.com/yt-dlp/yt-dlp/issues/7664)) by [bashonly](https://github.com/bashonly) +- **pbs**: [Add extractor `PBSKidsIE`](https://github.com/yt-dlp/yt-dlp/commit/6d6081dda1290a85bdab6717f239289e3aa74c8e) ([#7602](https://github.com/yt-dlp/yt-dlp/issues/7602)) by [snixon](https://github.com/snixon) +- **piapro**: [Support `/content` URL](https://github.com/yt-dlp/yt-dlp/commit/1bcb9fe8715b1f288efc322be3de409ee0597080) ([#7592](https://github.com/yt-dlp/yt-dlp/issues/7592)) by [FinnRG](https://github.com/FinnRG) +- **piaulizaportal**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/6636021206dad17c7745ae6bce6cb73d6f2ef319) ([#7903](https://github.com/yt-dlp/yt-dlp/issues/7903)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **picartovod**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/db9743894071760f994f640a4c24358f749a78c0) ([#7727](https://github.com/yt-dlp/yt-dlp/issues/7727)) by [Frankgoji](https://github.com/Frankgoji) +- **pornbox**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/40999467f72db074a3f13057da9bf82a857530fe) ([#7386](https://github.com/yt-dlp/yt-dlp/issues/7386)) by [niemands](https://github.com/niemands) +- **pornhub**: [Update access cookies for UK](https://github.com/yt-dlp/yt-dlp/commit/1d3d579c2142f69831b6ae140e1d8e824e07fa0e) ([#7591](https://github.com/yt-dlp/yt-dlp/issues/7591)) by [zhong-yiyu](https://github.com/zhong-yiyu) +- **pr0gramm**: [Rewrite extractor](https://github.com/yt-dlp/yt-dlp/commit/b532556d0a85e7d76f8f0880861232fb706ddbc5) ([#8151](https://github.com/yt-dlp/yt-dlp/issues/8151)) by [Grub4K](https://github.com/Grub4K) +- **radiofrance**: [Add support for livestreams, podcasts, playlists](https://github.com/yt-dlp/yt-dlp/commit/ba8e9eb2c8bbb699f314169fab8e544437ad731e) ([#7006](https://github.com/yt-dlp/yt-dlp/issues/7006)) by [elyse0](https://github.com/elyse0) +- **rbgtum**: [Fix extraction and support new URL format](https://github.com/yt-dlp/yt-dlp/commit/5fccabac27ca3c1165ade1b0df6fbadc24258dc2) ([#7690](https://github.com/yt-dlp/yt-dlp/issues/7690)) by [simon300000](https://github.com/simon300000) +- **reddit** + - [Extract subtitles](https://github.com/yt-dlp/yt-dlp/commit/20c3c9b433dd47faf0dbde6b46e4e34eb76109a5) by [bashonly](https://github.com/bashonly) + - [Fix thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/9a04113dfbb69b904e4e2bea736da293505786b8) by [bashonly](https://github.com/bashonly) +- **rtvslo**: [Fix format extraction](https://github.com/yt-dlp/yt-dlp/commit/94389b225d9bcf29aa7ba8afaf1bbd7c62204eae) ([#8131](https://github.com/yt-dlp/yt-dlp/issues/8131)) by [bashonly](https://github.com/bashonly) +- **rule34video**: [Extract tags](https://github.com/yt-dlp/yt-dlp/commit/58493923e9b6f774947a2131e5258e9f3cf816be) ([#7117](https://github.com/yt-dlp/yt-dlp/issues/7117)) by [soundchaser128](https://github.com/soundchaser128) +- **rumble**: [Fix embed extraction](https://github.com/yt-dlp/yt-dlp/commit/23d829a3420450bcfb0788e6fb2cf4f6acdbe596) ([#8035](https://github.com/yt-dlp/yt-dlp/issues/8035)) by [trislee](https://github.com/trislee) +- **s4c** + - [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/b9de629d78ce31699f2de886071dc257830f9676) ([#7730](https://github.com/yt-dlp/yt-dlp/issues/7730)) by [ifan-t](https://github.com/ifan-t) + - [Add series support and extract subs/thumbs](https://github.com/yt-dlp/yt-dlp/commit/fe371dcf0ba5ce8d42480eade54eeeac99ab3cb0) ([#7776](https://github.com/yt-dlp/yt-dlp/issues/7776)) by [ifan-t](https://github.com/ifan-t) +- 
**sohu**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/5be7e978867b5f66ad6786c674d79d40e950ae16) ([#7628](https://github.com/yt-dlp/yt-dlp/issues/7628)) by [bashonly](https://github.com/bashonly), [c-basalt](https://github.com/c-basalt) +- **stageplus**: [Fix m3u8 extraction](https://github.com/yt-dlp/yt-dlp/commit/56b3dc03354b75be995759d8441d2754c0442b9a) ([#7929](https://github.com/yt-dlp/yt-dlp/issues/7929)) by [bashonly](https://github.com/bashonly) +- **streamanity**: [Remove](https://github.com/yt-dlp/yt-dlp/commit/2cfe221fbbe46faa3f46552c08d947a51f424903) ([#7571](https://github.com/yt-dlp/yt-dlp/issues/7571)) by [alerikaisattera](https://github.com/alerikaisattera) +- **svtplay**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/2301b5c1b77a65abbb46b72f91e1e4666fd5d985) ([#7789](https://github.com/yt-dlp/yt-dlp/issues/7789)) by [dirkf](https://github.com/dirkf), [wader](https://github.com/wader) +- **tbsjp**: [Add episode, program, playlist extractors](https://github.com/yt-dlp/yt-dlp/commit/876b70c8edf4c0147f180bd981fbc4d625cbfb9c) ([#7765](https://github.com/yt-dlp/yt-dlp/issues/7765)) by [garret1317](https://github.com/garret1317) +- **tiktok** + - [Fix audio-only format extraction](https://github.com/yt-dlp/yt-dlp/commit/b09bd0c19648f60c59fb980cd454cb0069959fb9) ([#7712](https://github.com/yt-dlp/yt-dlp/issues/7712)) by [bashonly](https://github.com/bashonly) + - [Fix webpage extraction](https://github.com/yt-dlp/yt-dlp/commit/069cbece9dba6384f1cc5fcfc7ce562a31af42fc) by [bashonly](https://github.com/bashonly) +- **triller**: [Fix unlisted video extraction](https://github.com/yt-dlp/yt-dlp/commit/39837ae3199aa934299badbd0d63243ed639e6c8) ([#7670](https://github.com/yt-dlp/yt-dlp/issues/7670)) by [bashonly](https://github.com/bashonly) +- **tv5mondeplus**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/7d3d658f4c558ee7d72b1c01b46f2126948681cd) ([#7952](https://github.com/yt-dlp/yt-dlp/issues/7952)) by [dirkf](https://github.com/dirkf), [korli](https://github.com/korli) +- **twitcasting** + - [Improve `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/cebbd33b1c678149fc8f0e254db6fc0da317ea80) ([#8120](https://github.com/yt-dlp/yt-dlp/issues/8120)) by [c-basalt](https://github.com/c-basalt) + - [Support `--wait-for-video`](https://github.com/yt-dlp/yt-dlp/commit/c1d71d0d9f41db5e4306c86af232f5f6220a130b) ([#7975](https://github.com/yt-dlp/yt-dlp/issues/7975)) by [at-wat](https://github.com/at-wat) +- **twitter** + - [Add fallback, improve error handling](https://github.com/yt-dlp/yt-dlp/commit/6014355c6142f68e20c8374e3787e5b5820f19e2) ([#7621](https://github.com/yt-dlp/yt-dlp/issues/7621)) by [bashonly](https://github.com/bashonly) + - [Fix GraphQL and legacy API](https://github.com/yt-dlp/yt-dlp/commit/92315c03774cfabb3a921884326beb4b981f786b) ([#7516](https://github.com/yt-dlp/yt-dlp/issues/7516)) by [bashonly](https://github.com/bashonly) + - [Fix retweet extraction and syndication API](https://github.com/yt-dlp/yt-dlp/commit/a006ce2b27357c15792eb5c18f06765e640b801c) ([#8016](https://github.com/yt-dlp/yt-dlp/issues/8016)) by [bashonly](https://github.com/bashonly) + - [Revert 92315c03774cfabb3a921884326beb4b981f786b](https://github.com/yt-dlp/yt-dlp/commit/b03fa7834579a01cc5fba48c0e73488a16683d48) by [pukkandan](https://github.com/pukkandan) + - spaces + - [Fix format protocol](https://github.com/yt-dlp/yt-dlp/commit/613dbce177d34ffc31053e8e01acf4bb107bcd1e) ([#7550](https://github.com/yt-dlp/yt-dlp/issues/7550)) by 
[bashonly](https://github.com/bashonly) + - [Pass referer header to downloader](https://github.com/yt-dlp/yt-dlp/commit/c6ef553792ed48462f9fd0e78143bef6b1a71c2e) by [bashonly](https://github.com/bashonly) +- **unsupported**: [List more sites with DRM](https://github.com/yt-dlp/yt-dlp/commit/e7057383380d7d53815f8feaf90ca3dcbde88983) by [pukkandan](https://github.com/pukkandan) +- **videa**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/98eac0e6ba0e510ae7dfdfd249d42ee71fb272b1) ([#8003](https://github.com/yt-dlp/yt-dlp/issues/8003)) by [aky-01](https://github.com/aky-01), [hatsomatt](https://github.com/hatsomatt) +- **vrt**: [Update token signing key](https://github.com/yt-dlp/yt-dlp/commit/325191d0c9bf3fe257b8a7c2eb95080f44f6ddfc) ([#7519](https://github.com/yt-dlp/yt-dlp/issues/7519)) by [Zprokkel](https://github.com/Zprokkel) +- **wat.tv**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/7cccab79e7d00ed965b48b8cefce1da8a0513409) ([#7898](https://github.com/yt-dlp/yt-dlp/issues/7898)) by [davinkevin](https://github.com/davinkevin) +- **wdr**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/5d0395498d7065aa5e55bac85fa9354b4b0d48eb) ([#7979](https://github.com/yt-dlp/yt-dlp/issues/7979)) by [szabyg](https://github.com/szabyg) +- **web.archive**: vlive: [Remove extractor](https://github.com/yt-dlp/yt-dlp/commit/9652bca1bd02f6bc1b8cb1e186f2ccbf32225561) ([#8132](https://github.com/yt-dlp/yt-dlp/issues/8132)) by [bashonly](https://github.com/bashonly) +- **weibo**: [Fix extractor and support user extraction](https://github.com/yt-dlp/yt-dlp/commit/69b03f84f8378b0b5a2fbae56f9b7d860b2f529e) ([#7657](https://github.com/yt-dlp/yt-dlp/issues/7657)) by [c-basalt](https://github.com/c-basalt) +- **weverse**: [Support extraction without auth](https://github.com/yt-dlp/yt-dlp/commit/c2d8ee0000302aba63476b7d5bd8793e57b6c8c6) ([#7924](https://github.com/yt-dlp/yt-dlp/issues/7924)) by [seproDev](https://github.com/seproDev) +- **wimbledon**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/a15fcd299e767a510debd8dc1646fe863b96ce0e) ([#7551](https://github.com/yt-dlp/yt-dlp/issues/7551)) by [nnoboa](https://github.com/nnoboa) +- **wrestleuniverseppv**: [Fix HLS AES key extraction](https://github.com/yt-dlp/yt-dlp/commit/dae349da97cafe7357106a8f3187fd48a2ad1210) by [bashonly](https://github.com/bashonly) +- **youtube** + - [Add `player_params` extractor arg](https://github.com/yt-dlp/yt-dlp/commit/ba06d77a316650ff057347d224b5afa8b203ad65) ([#7719](https://github.com/yt-dlp/yt-dlp/issues/7719)) by [coletdjnz](https://github.com/coletdjnz) + - [Fix `player_params` arg being converted to lowercase](https://github.com/yt-dlp/yt-dlp/commit/546b2c28a106cf8101d481b215b676d1b091d276) by [coletdjnz](https://github.com/coletdjnz) + - [Fix consent cookie](https://github.com/yt-dlp/yt-dlp/commit/378ae9f9fb8e8c86e6ac89c4c5b815b48ce93620) ([#7774](https://github.com/yt-dlp/yt-dlp/issues/7774)) by [coletdjnz](https://github.com/coletdjnz) + - tab: [Detect looping feeds](https://github.com/yt-dlp/yt-dlp/commit/1ba6fe9db5f660d5538588315c23ad6cf0371c5f) ([#6621](https://github.com/yt-dlp/yt-dlp/issues/6621)) by [coletdjnz](https://github.com/coletdjnz) +- **zaiko**: [Improve thumbnail extraction](https://github.com/yt-dlp/yt-dlp/commit/ecef42c3adbcb6a84405139047923c4967316f28) ([#8054](https://github.com/yt-dlp/yt-dlp/issues/8054)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **zee5**: [Update access token 
endpoint](https://github.com/yt-dlp/yt-dlp/commit/a0de8bb8601146b8f87bf7cd562eef8bfb4690be) ([#7914](https://github.com/yt-dlp/yt-dlp/issues/7914)) by [bashonly](https://github.com/bashonly) +- **zoom**: [Extract duration](https://github.com/yt-dlp/yt-dlp/commit/66cc64ff6696f9921ff112a278542f8d999ffea4) by [bashonly](https://github.com/bashonly) + +#### Downloader changes +- **external** + - [Fix ffmpeg input from stdin](https://github.com/yt-dlp/yt-dlp/commit/e57eb98222d29cc4c09ee975d3c492274a6e5be3) ([#7655](https://github.com/yt-dlp/yt-dlp/issues/7655)) by [bashonly](https://github.com/bashonly) + - [Fixes to cookie handling](https://github.com/yt-dlp/yt-dlp/commit/42ded0a429c20ec13dc006825e1508d9a02f0ad4) by [bashonly](https://github.com/bashonly) + +#### Postprocessor changes +- **embedthumbnail**: [Support `m4v`](https://github.com/yt-dlp/yt-dlp/commit/8a4cd12c8f8e93292e3e95200b9d17a3af39624c) ([#7583](https://github.com/yt-dlp/yt-dlp/issues/7583)) by [Neurognostic](https://github.com/Neurognostic) + +#### Networking changes +- [Add module](https://github.com/yt-dlp/yt-dlp/commit/c365dba8430ee33abda85d31f95128605bf240eb) ([#2861](https://github.com/yt-dlp/yt-dlp/issues/2861)) by [pukkandan](https://github.com/pukkandan) +- [Add request handler preference framework](https://github.com/yt-dlp/yt-dlp/commit/db7b054a6111ca387220d0eb87bf342f9c130eb8) ([#7603](https://github.com/yt-dlp/yt-dlp/issues/7603)) by [coletdjnz](https://github.com/coletdjnz) +- [Add strict Request extension checking](https://github.com/yt-dlp/yt-dlp/commit/86aea0d3a213da3be1da638b9b828e6f0ee1d59f) ([#7604](https://github.com/yt-dlp/yt-dlp/issues/7604)) by [coletdjnz](https://github.com/coletdjnz) +- [Fix POST requests with zero-length payloads](https://github.com/yt-dlp/yt-dlp/commit/71baa490ebd3655746430f208a9b605d120cd315) ([#7648](https://github.com/yt-dlp/yt-dlp/issues/7648)) by [bashonly](https://github.com/bashonly) +- [Fix `--legacy-server-connect`](https://github.com/yt-dlp/yt-dlp/commit/75dc8e673b481a82d0688aeec30f6c65d82bb359) ([#7645](https://github.com/yt-dlp/yt-dlp/issues/7645)) by [bashonly](https://github.com/bashonly) +- [Fix various socks proxy bugs](https://github.com/yt-dlp/yt-dlp/commit/20fbbd9249a2f26c7ae579bde5ba5d69aa8fac69) ([#8065](https://github.com/yt-dlp/yt-dlp/issues/8065)) by [coletdjnz](https://github.com/coletdjnz) +- [Ignore invalid proxies in env](https://github.com/yt-dlp/yt-dlp/commit/bbeacff7fcaa3b521066088a5ccbf34ef5070d1d) ([#7704](https://github.com/yt-dlp/yt-dlp/issues/7704)) by [coletdjnz](https://github.com/coletdjnz) +- [Rewrite architecture](https://github.com/yt-dlp/yt-dlp/commit/227bf1a33be7b89cd7d44ad046844c4ccba104f4) ([#2861](https://github.com/yt-dlp/yt-dlp/issues/2861)) by [coletdjnz](https://github.com/coletdjnz) +- **Request Handler** + - urllib + - [Remove dot segments during URL normalization](https://github.com/yt-dlp/yt-dlp/commit/4bf912282a34b58b6b35d8f7e6be535770c89c76) ([#7662](https://github.com/yt-dlp/yt-dlp/issues/7662)) by [coletdjnz](https://github.com/coletdjnz) + - [Simplify gzip decoding](https://github.com/yt-dlp/yt-dlp/commit/59e92b1f1833440bb2190f847eb735cf0f90bc85) ([#7611](https://github.com/yt-dlp/yt-dlp/issues/7611)) by [Grub4K](https://github.com/Grub4K) (With fixes in [77bff23](https://github.com/yt-dlp/yt-dlp/commit/77bff23ee97565bab2e0d75b893a21bf7983219a)) + +#### Misc. 
changes +- **build**: [Make sure deprecated modules are added](https://github.com/yt-dlp/yt-dlp/commit/131d132da5c98c6c78bd7eed4b37f4458561b3d9) by [pukkandan](https://github.com/pukkandan) +- **cleanup** + - [Add color to `download-archive` message](https://github.com/yt-dlp/yt-dlp/commit/2b029ca0a9f9105c4f7626993fa60e54c9782749) ([#5138](https://github.com/yt-dlp/yt-dlp/issues/5138)) by [aaruni96](https://github.com/aaruni96), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) + - Miscellaneous + - [6148833](https://github.com/yt-dlp/yt-dlp/commit/6148833f5ceb7674142ddb8d761ffe03cee7df69), [62b5c94](https://github.com/yt-dlp/yt-dlp/commit/62b5c94cadaa5f596dc1a7083db9db12efe357be) by [pukkandan](https://github.com/pukkandan) + - [5ca095c](https://github.com/yt-dlp/yt-dlp/commit/5ca095cbcde3e32642a4fe5b2d69e8e3c785a021) by [barsnick](https://github.com/barsnick), [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K), [sqrtNOT](https://github.com/sqrtNOT) + - [088add9](https://github.com/yt-dlp/yt-dlp/commit/088add9567d39b758737e4299a0e619fd89d2e8f) by [Grub4K](https://github.com/Grub4K) +- **devscripts**: `make_changelog`: [Fix changelog grouping and add networking group](https://github.com/yt-dlp/yt-dlp/commit/30ba233d4cee945756ed7344e7ddb3a90d2ae608) ([#8124](https://github.com/yt-dlp/yt-dlp/issues/8124)) by [Grub4K](https://github.com/Grub4K) +- **docs**: [Update collaborators](https://github.com/yt-dlp/yt-dlp/commit/1be0a96a4d14f629097509fcc89d15f69a8243c7) by [Grub4K](https://github.com/Grub4K) +- **test** + - [Add tests for socks proxies](https://github.com/yt-dlp/yt-dlp/commit/fcd6a76adc49d5cd8783985c7ce35384b72e545f) ([#7908](https://github.com/yt-dlp/yt-dlp/issues/7908)) by [coletdjnz](https://github.com/coletdjnz) + - [Fix `httplib_validation_errors` test for old Python versions](https://github.com/yt-dlp/yt-dlp/commit/95abea9a03289da1384e5bda3d590223ccc0a238) ([#7677](https://github.com/yt-dlp/yt-dlp/issues/7677)) by [coletdjnz](https://github.com/coletdjnz) + - [Fix `test_load_certifi`](https://github.com/yt-dlp/yt-dlp/commit/de20687ee6b742646128a7629b57096631a20619) by [pukkandan](https://github.com/pukkandan) + - download: [Test for `expected_exception`](https://github.com/yt-dlp/yt-dlp/commit/661c9a1d029296b28e0b2f8be8a72a43abaf6536) by [at-wat](https://github.com/at-wat) + ### 2023.07.06 #### Important changes diff --git a/supportedsites.md b/supportedsites.md index 379d28ef3..620e0f305 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -77,7 +77,7 @@ # Supported sites - **AnimalPlanet** - **ant1newsgr:article**: ant1news.gr articles - **ant1newsgr:embed**: ant1news.gr embedded videos - - **ant1newsgr:watch**: ant1news.gr videos + - **antenna:watch**: antenna.gr and ant1news.gr videos - **Anvato** - **aol.com**: Yahoo screen and movies - **APA** @@ -98,8 +98,6 @@ # Supported sites - **ArteTVCategory** - **ArteTVEmbed** - **ArteTVPlaylist** - - **AsianCrush** - - **AsianCrushPlaylist** - **AtresPlayer**: [*atresplayer*](## "netrc machine") - **AtScaleConfEvent** - **ATTTechChannel** @@ -118,6 +116,7 @@ # Supported sites - **awaan:live** - **awaan:season** - **awaan:video** + - **axs.tv** - **AZMedien**: AZ Medien videos - **BaiduVideo**: 百度视频 - **BanBye** @@ -162,11 +161,16 @@ # Supported sites - **BilibiliAudioAlbum** - **BiliBiliBangumi** - **BiliBiliBangumiMedia** + - **BiliBiliBangumiSeason** + - **BilibiliCollectionList** + - 
**BilibiliFavoritesList** - **BiliBiliPlayer** + - **BilibiliPlaylist** - **BiliBiliSearch**: Bilibili video search; "bilisearch:" prefix + - **BilibiliSeriesList** - **BilibiliSpaceAudio** - - **BilibiliSpacePlaylist** - **BilibiliSpaceVideo** + - **BilibiliWatchlater** - **BiliIntl**: [*biliintl*](## "netrc machine") - **biliIntl:series**: [*biliintl*](## "netrc machine") - **BiliLive** @@ -201,6 +205,8 @@ # Supported sites - **BreitBart** - **brightcove:legacy** - **brightcove:new** + - **Brilliantpala:Classes**: [*brilliantpala*](## "netrc machine") VoD on classes.brilliantpala.org + - **Brilliantpala:Elearn**: [*brilliantpala*](## "netrc machine") VoD on elearn.brilliantpala.org - **BRMediathek**: Bayerischer Rundfunk Mediathek - **bt:article**: Bergens Tidende Articles - **bt:vestlendingen**: Bergens Tidende - Vestlendingen @@ -220,14 +226,17 @@ # Supported sites - **Camsoda** - **CamtasiaEmbed** - **CamWithHer** + - **Canal1** - **CanalAlpha** - **canalc2.tv** - **Canalplus**: mycanal.fr and piwiplus.fr + - **CaracolTvPlay**: [*caracoltv-play*](## "netrc machine") - **CarambaTV** - **CarambaTVPage** - **CartoonNetwork** - **cbc.ca** - **cbc.ca:player** + - **cbc.ca:​player:playlist** - **CBS** - **CBSInteractive** - **CBSLocal** @@ -257,6 +266,8 @@ # Supported sites - **Cinchcast** - **Cinemax** - **CinetecaMilano** + - **Cineverse** + - **CineverseDetails** - **CiscoLiveSearch** - **CiscoLiveSession** - **ciscowebex**: Cisco Webex @@ -365,7 +376,7 @@ # Supported sites - **Dotsub** - **Douyin** - **DouyuShow** - - **DouyuTV**: 斗鱼 + - **DouyuTV**: 斗鱼直播 - **DPlay** - **DRBonanza** - **Drooble** @@ -408,6 +419,7 @@ # Supported sites - **Engadget** - **Epicon** - **EpiconSeries** + - **eplus:inbound**: e+ (イープラス) overseas - **Epoch** - **Eporner** - **EroProfile**: [*eroprofile*](## "netrc machine") @@ -732,6 +744,7 @@ # Supported sites - **lynda**: [*lynda*](## "netrc machine") lynda.com videos - **lynda:course**: [*lynda*](## "netrc machine") lynda.com online courses - **m6** + - **MagellanTV** - **MagentaMusik360** - **mailru**: Видео@Mail.Ru - **mailru:music**: Музыка@Mail.Ru @@ -812,6 +825,7 @@ # Supported sites - **Mofosex** - **MofosexEmbed** - **Mojvideo** + - **Monstercat** - **MonsterSirenHypergryphMusic** - **Morningstar**: morningstar.com - **Motherless** @@ -840,6 +854,7 @@ # Supported sites - **MujRozhlas** - **Murrtube** - **MurrtubeUser**: Murrtube user profile + - **MuseAI** - **MuseScore** - **MusicdexAlbum** - **MusicdexArtist** @@ -944,6 +959,9 @@ # Supported sites - **niconico:playlist** - **niconico:series** - **niconico:tag**: NicoNico video tag URLs + - **NiconicoChannelPlus**: ニコニコチャンネルプラス + - **NiconicoChannelPlus:​channel:lives**: ニコニコチャンネルプラス - チャンネル - ライブリスト. nicochannel.jp/channel/lives + - **NiconicoChannelPlus:​channel:videos**: ニコニコチャンネルプラス - チャンネル - 動画リスト. 
nicochannel.jp/channel/videos - **NiconicoUser** - **nicovideo:search**: Nico video search; "nicosearch:" prefix - **nicovideo:​search:date**: Nico video search, newest first; "nicosearchdate:" prefix @@ -1046,6 +1064,7 @@ # Supported sites - **Patreon** - **PatreonCampaign** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) + - **PBSKids** - **PearVideo** - **PeekVids** - **peer.tv** @@ -1062,6 +1081,7 @@ 
# Supported sites - **phoenix.de** - **Photobucket** - **Piapro**: [*piapro*](## "netrc machine") + - **PIAULIZAPortal**: ulizaportal.jp - PIA LIVE STREAM - **Picarto** - **PicartoVod** - **Piksel** @@ -1105,6 +1125,7 @@ # Supported sites - **polskieradio:​podcast:list** - **Popcorntimes** - **PopcornTV** + - **Pornbox** - **PornCom** - **PornerBros** - **Pornez** @@ -1121,7 +1142,6 @@ # Supported sites - **PornTop** - **PornTube** - **Pr0gramm** - - **Pr0grammStatic** - **PrankCast** - **PremiershipRugby** - **PressTV** @@ -1156,6 +1176,10 @@ # Supported sites - **radiocanada** - **radiocanada:audiovideo** - **radiofrance** + - **RadioFranceLive** + - **RadioFrancePodcast** + - **RadioFranceProfile** + - **RadioFranceProgramSchedule** - **RadioJavan** - **radiokapital** - **radiokapital:show** @@ -1177,6 +1201,7 @@ # Supported sites - **RayWenderlichCourse** - **RbgTum** - **RbgTumCourse** + - **RbgTumNewCourse** - **RBMARadio** - **RCS** - **RCSEmbeds** @@ -1259,6 +1284,8 @@ # Supported sites - **Ruutu** - **Ruv** - **ruv.is:spila** + - **S4C** + - **S4CSeries** - **safari**: [*safari*](## "netrc machine") safaribooksonline.com online video - **safari:api**: [*safari*](## "netrc machine") - **safari:course**: [*safari*](## "netrc machine") safaribooksonline.com online courses @@ -1325,6 +1352,7 @@ # Supported sites - **Smotrim** - **Snotr** - **Sohu** + - **SohuV** - **SonyLIV**: [*sonyliv*](## "netrc machine") - **SonyLIVSeries** - **soundcloud**: [*soundcloud*](## "netrc machine") @@ -1378,7 +1406,6 @@ # Supported sites - **StoryFireSeries** - **StoryFireUser** - **Streamable** - - **Streamanity** - **streamcloud.eu** - **StreamCZ** - **StreamFF** @@ -1403,6 +1430,9 @@ # Supported sites - **Tagesschau** - **Tass** - **TBS** + - **TBSJPEpisode** + - **TBSJPPlaylist** + - **TBSJPProgram** - **TDSLifeway** - **Teachable**: [*teachable*](## "netrc machine") - **TeachableCourse**: [*teachable*](## "netrc machine") @@ -1702,7 +1732,6 @@ # Supported sites - **wdr:mobile**: (**Currently broken**) - **WDRElefant** - **WDRPage** - - **web.archive:vlive**: web.archive.org saved vlive videos - **web.archive:youtube**: web.archive.org saved youtube videos, "ytarchive:" prefix - **Webcamerapl** - **Webcaster** @@ -1710,7 +1739,8 @@ # Supported sites - **WebOfStories** - **WebOfStoriesPlaylist** - **Weibo** - - **WeiboMobile** + - **WeiboUser** + - **WeiboVideo** - **WeiqiTV**: WQTV - **wetv:episode** - **WeTvSeries** @@ -1726,6 +1756,7 @@ # Supported sites - **Whyp** - **wikimedia.org** - **Willow** + - **Wimbledon** - **WimTV** - **WinSportsVideo** - **Wistia** diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 67cfe44ef..2a7c84b93 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2023.07.06' +__version__ = '2023.09.24' -RELEASE_GIT_HEAD = 'b532a3481046e1eabb6232ee8196fb696c356ff6' +RELEASE_GIT_HEAD = '088add9567d39b758737e4299a0e619fd89d2e8f' VARIANT = None From eb5bdbfa70126c7d5355cc0954b63720522e462c Mon Sep 17 00:00:00 2001 From: coletdjnz Date: Tue, 3 Oct 2023 19:42:30 +1300 Subject: [PATCH 197/218] [ie/youtube] Raise a warning for `Incomplete Data` instead of an error (#8238) Closes https://github.com/yt-dlp/yt-dlp/issues/8206 Adds `raise_incomplete_data` extractor arg to revert this behaviour and raise an error. 
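For example, assuming the usual `--extractor-args` syntax documented in the README
(this invocation is illustrative and not part of the patch itself):

    yt-dlp --extractor-args "youtube:raise_incomplete_data" URL

With the arg set, exhausting the retries for `Incomplete data received` is fatal
again instead of being reported as a warning.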
Authored by: coletdjnz Co-authored-by: Simon Sawicki --- README.md | 1 + yt_dlp/extractor/youtube.py | 26 +++++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7bf446572..a0b69c9a1 100644 --- a/README.md +++ b/README.md @@ -1809,6 +1809,7 @@ #### youtube * `formats`: Change the types of formats to return. `dashy` (convert HTTP to DASH), `duplicate` (identical content but different URLs or protocol; includes `dashy`), `incomplete` (cannot be downloaded completely - live dash and post-live m3u8) * `innertube_host`: Innertube API host to use for all API requests; e.g. `studio.youtube.com`, `youtubei.googleapis.com`. Note that cookies exported from one subdomain will not work on others * `innertube_key`: Innertube API key to use for all API requests +* `raise_incomplete_data`: `Incomplete Data Received` raises an error instead of reporting a warning #### youtubetab (YouTube playlists, channels, feeds, etc.) * `skip`: One or more of `webpage` (skip initial webpage download), `authcheck` (allow the download of playlists requiring authentication when no initial webpage is downloaded. This may cause unwanted behavior, see [#1122](https://github.com/yt-dlp/yt-dlp/pull/1122) for more details) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index a39d17cf1..7e13aa779 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -941,7 +941,13 @@ def _parse_time_text(self, text): def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, default_client='web'): - for retry in self.RetryManager(): + raise_for_incomplete = bool(self._configuration_arg('raise_incomplete_data', ie_key=YoutubeIE)) + # Incomplete Data should be a warning by default when retries are exhausted, while other errors should be fatal. + icd_retries = iter(self.RetryManager(fatal=raise_for_incomplete)) + icd_rm = next(icd_retries) + main_retries = iter(self.RetryManager()) + main_rm = next(main_retries) + for _ in range(main_rm.retries + icd_rm.retries + 1): try: response = self._call_api( ep=ep, fatal=True, headers=headers, @@ -953,7 +959,8 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers if not isinstance(e.cause, network_exceptions): return self._error_or_warning(e, fatal=fatal) elif not isinstance(e.cause, HTTPError): - retry.error = e + main_rm.error = e + next(main_retries) continue first_bytes = e.cause.response.read(512) @@ -965,27 +972,32 @@ def _extract_response(self, item_id, query, note='Downloading API JSON', headers if yt_error: self._report_alerts([('ERROR', yt_error)], fatal=False) # Downloading page may result in intermittent 5xx HTTP error - # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289 + # Sometimes a 404 is also received. 
See: https://github.com/ytdl-org/youtube-dl/issues/28289 # We also want to catch all other network exceptions since errors in later pages can be troublesome # See https://github.com/yt-dlp/yt-dlp/issues/507#issuecomment-880188210 if e.cause.status not in (403, 429): - retry.error = e + main_rm.error = e + next(main_retries) continue return self._error_or_warning(e, fatal=fatal) try: self._extract_and_report_alerts(response, only_once=True) except ExtractorError as e: - # YouTube servers may return errors we want to retry on in a 200 OK response + # YouTube's servers may return errors we want to retry on in a 200 OK response # See: https://github.com/yt-dlp/yt-dlp/issues/839 if 'unknown error' in e.msg.lower(): - retry.error = e + main_rm.error = e + next(main_retries) continue return self._error_or_warning(e, fatal=fatal) # Youtube sometimes sends incomplete data # See: https://github.com/ytdl-org/youtube-dl/issues/28194 if not traverse_obj(response, *variadic(check_get_keys)): - retry.error = ExtractorError('Incomplete data received', expected=True) + icd_rm.error = ExtractorError('Incomplete data received', expected=True) + should_retry = next(icd_retries, None) + if not should_retry: + return None continue return response From cc8d8441524ec3442d7c0d3f8f33f15b66aa06f3 Mon Sep 17 00:00:00 2001 From: Simon Sawicki Date: Tue, 3 Oct 2023 11:33:40 +0200 Subject: [PATCH 198/218] [ie/xhamster:user] Support creator urls (#8232) Authored by: Grub4K --- yt_dlp/extractor/xhamster.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/yt_dlp/extractor/xhamster.py b/yt_dlp/extractor/xhamster.py index 37224799b..aec1f20bb 100644 --- a/yt_dlp/extractor/xhamster.py +++ b/yt_dlp/extractor/xhamster.py @@ -407,7 +407,7 @@ def _real_extract(self, url): class XHamsterUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?%s/users/(?P[^/?#&]+)' % XHamsterIE._DOMAINS + _VALID_URL = rf'https?://(?:[^/?#]+\.)?{XHamsterIE._DOMAINS}/(?:(?Pusers)|creators)/(?P[^/?#&]+)' _TESTS = [{ # Paginated user profile 'url': 'https://xhamster.com/users/netvideogirls/videos', @@ -422,6 +422,12 @@ class XHamsterUserIE(InfoExtractor): 'id': 'firatkaan', }, 'playlist_mincount': 1, + }, { + 'url': 'https://xhamster.com/creators/squirt-orgasm-69', + 'info_dict': { + 'id': 'squirt-orgasm-69', + }, + 'playlist_mincount': 150, }, { 'url': 'https://xhday.com/users/mobhunter', 'only_matching': True, @@ -430,8 +436,9 @@ class XHamsterUserIE(InfoExtractor): 'only_matching': True, }] - def _entries(self, user_id): - next_page_url = 'https://xhamster.com/users/%s/videos/1' % user_id + def _entries(self, user_id, is_user): + prefix, suffix = ('users', 'videos') if is_user else ('creators', 'exclusive') + next_page_url = f'https://xhamster.com/{prefix}/{user_id}/{suffix}/1' for pagenum in itertools.count(1): page = self._download_webpage( next_page_url, user_id, 'Downloading page %s' % pagenum) @@ -454,5 +461,5 @@ def _entries(self, user_id): break def _real_extract(self, url): - user_id = self._match_id(url) - return self.playlist_result(self._entries(user_id), user_id) + user, user_id = self._match_valid_url(url).group('user', 'id') + return self.playlist_result(self._entries(user_id, bool(user)), user_id) From 0730d5a966fa8a937d84bfb7f68be5198acb039b Mon Sep 17 00:00:00 2001 From: bashonly Date: Wed, 4 Oct 2023 12:44:13 -0500 Subject: [PATCH 199/218] [ie/gofile] Fix token cookie bug Authored by: bashonly --- yt_dlp/extractor/gofile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/yt_dlp/extractor/gofile.py b/yt_dlp/extractor/gofile.py index 898390583..ef14b57d0 100644 --- a/yt_dlp/extractor/gofile.py +++ b/yt_dlp/extractor/gofile.py @@ -60,7 +60,7 @@ def _real_initialize(self): account_data = self._download_json( 'https://api.gofile.io/createAccount', None, note='Getting a new guest account') self._TOKEN = account_data['data']['token'] - self._set_cookie('gofile.io', 'accountToken', self._TOKEN) + self._set_cookie('.gofile.io', 'accountToken', self._TOKEN) def _entries(self, file_id): query_params = { From b095fd3fa9d58a65dc9b830bd63b9d909422aa86 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Wed, 4 Oct 2023 13:01:52 -0500 Subject: [PATCH 200/218] [ie/WrestleUniverseVOD] Call API with device ID (#8272) Closes #8271 Authored by: bashonly --- yt_dlp/extractor/wrestleuniverse.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py index dd12804db..145246a14 100644 --- a/yt_dlp/extractor/wrestleuniverse.py +++ b/yt_dlp/extractor/wrestleuniverse.py @@ -190,10 +190,7 @@ class WrestleUniverseVODIE(WrestleUniverseBaseIE): def _real_extract(self, url): lang, video_id = self._match_valid_url(url).group('lang', 'id') metadata = self._download_metadata(url, video_id, lang, 'videoEpisodeFallbackData') - video_data = self._call_api(video_id, ':watch', 'watch', data={ - # 'deviceId' is required if ignoreDeviceRestriction is False - 'ignoreDeviceRestriction': True, - }) + video_data = self._call_api(video_id, ':watch', 'watch', data={'deviceId': self._DEVICE_ID}) return { 'id': video_id, From 91a670a4f7babe9c8aa2018f57d8c8952a6f49d8 Mon Sep 17 00:00:00 2001 From: gillux Date: Sat, 7 Oct 2023 06:27:54 +0800 Subject: [PATCH 201/218] [ie/LiTV] Fix extractor (#7785) Closes #5456 Authored by: jiru --- yt_dlp/extractor/litv.py | 48 ++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/yt_dlp/extractor/litv.py b/yt_dlp/extractor/litv.py index 19b298ec6..2c7c7175e 100644 --- a/yt_dlp/extractor/litv.py +++ b/yt_dlp/extractor/litv.py @@ -13,7 +13,7 @@ class LiTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?litv\.tv/(?:vod|promo)/[^/]+/(?:content\.do)?\?.*?\b(?:content_)?id=(?P[^&]+)' - _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?id=%s' + _URL_TEMPLATE = 'https://www.litv.tv/vod/%s/content.do?content_id=%s' _TESTS = [{ 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', @@ -21,16 +21,18 @@ class LiTVIE(InfoExtractor): 'id': 'VOD00041606', 'title': '花千骨', }, - 'playlist_count': 50, + 'playlist_count': 51, # 50 episodes + 1 trailer }, { 'url': 'https://www.litv.tv/vod/drama/content.do?brc_id=root&id=VOD00041610&isUHEnabled=true&autoPlay=1', - 'md5': '969e343d9244778cb29acec608e53640', + 'md5': 'b90ff1e9f1d8f5cfcd0a44c3e2b34c7a', 'info_dict': { 'id': 'VOD00041610', 'ext': 'mp4', 'title': '花千骨第1集', 'thumbnail': r're:https?://.*\.jpg$', - 'description': 'md5:c7017aa144c87467c4fb2909c4b05d6f', + 'description': '《花千骨》陸劇線上看。十六年前,平靜的村莊內,一名女嬰隨異相出生,途徑此地的蜀山掌門清虛道長算出此女命運非同一般,她體內散發的異香易招惹妖魔。一念慈悲下,他在村莊周邊設下結界阻擋妖魔入侵,讓其年滿十六後去蜀山,並賜名花千骨。', + 'categories': ['奇幻', '愛情', '中國', '仙俠'], + 'episode': 'Episode 1', 'episode_number': 1, }, 'params': { @@ -46,20 +48,17 @@ class LiTVIE(InfoExtractor): 'title': '芈月傳第1集 霸星芈月降世楚國', 'description': 
'楚威王二年,太史令唐昧夜觀星象,發現霸星即將現世。王后得知霸星的預言後,想盡辦法不讓孩子順利出生,幸得莒姬相護化解危機。沒想到眾人期待下出生的霸星卻是位公主,楚威王對此失望至極。楚王后命人將女嬰丟棄河中,居然奇蹟似的被少司命像攔下,楚威王認為此女非同凡響,為她取名芈月。', }, - 'skip': 'Georestricted to Taiwan', + 'skip': 'No longer exists', }] - def _extract_playlist(self, season_list, video_id, program_info, prompt=True): - episode_title = program_info['title'] - content_id = season_list['contentId'] - + def _extract_playlist(self, playlist_data, content_type): all_episodes = [ self.url_result(smuggle_url( - self._URL_TEMPLATE % (program_info['contentType'], episode['contentId']), + self._URL_TEMPLATE % (content_type, episode['contentId']), {'force_noplaylist': True})) # To prevent infinite recursion - for episode in season_list['episode']] + for episode in traverse_obj(playlist_data, ('seasons', ..., 'episode', lambda _, v: v['contentId']))] - return self.playlist_result(all_episodes, content_id, episode_title) + return self.playlist_result(all_episodes, playlist_data['contentId'], playlist_data.get('title')) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -68,24 +67,31 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) + if self._search_regex( + r'(?i)]*http-equiv="refresh"\s[^>]*content="[0-9]+;\s*url=https://www\.litv\.tv/"', + webpage, 'meta refresh redirect', default=False, group=0): + raise ExtractorError('No such content found', expected=True) + program_info = self._parse_json(self._search_regex( r'var\s+programInfo\s*=\s*([^;]+)', webpage, 'VOD data', default='{}'), video_id) - season_list = list(program_info.get('seasonList', {}).values()) - playlist_id = traverse_obj(season_list, 0, 'contentId') - if self._yes_playlist(playlist_id, video_id, smuggled_data): - return self._extract_playlist(season_list[0], video_id, program_info) - - # In browsers `getMainUrl` request is always issued. Usually this + # In browsers `getProgramInfo` request is always issued. Usually this # endpoint gives the same result as the data embedded in the webpage. - # If georestricted, there are no embedded data, so an extra request is - # necessary to get the error code + # If, for some reason, there are no embedded data, we do an extra request. 
if 'assetId' not in program_info: program_info = self._download_json( 'https://www.litv.tv/vod/ajax/getProgramInfo', video_id, query={'contentId': video_id}, headers={'Accept': 'application/json'}) + + series_id = program_info['seriesId'] + if self._yes_playlist(series_id, video_id, smuggled_data): + playlist_data = self._download_json( + 'https://www.litv.tv/vod/ajax/getSeriesTree', video_id, + query={'seriesId': series_id}, headers={'Accept': 'application/json'}) + return self._extract_playlist(playlist_data, program_info['contentType']) + video_data = self._parse_json(self._search_regex( r'uiHlsUrl\s*=\s*testBackendData\(([^;]+)\);', webpage, 'video data', default='{}'), video_id) @@ -96,7 +102,7 @@ def _real_extract(self, url): 'contentType': program_info['contentType'], } video_data = self._download_json( - 'https://www.litv.tv/vod/getMainUrl', video_id, + 'https://www.litv.tv/vod/ajax/getMainUrlNoAuth', video_id, data=json.dumps(payload).encode('utf-8'), headers={'Content-Type': 'application/json'}) From f980df734cf5c0eaded2f7b38c6c60bccfeebb48 Mon Sep 17 00:00:00 2001 From: c-basalt <117849907+c-basalt@users.noreply.github.com> Date: Fri, 6 Oct 2023 18:31:33 -0400 Subject: [PATCH 202/218] [ie/neteasemusic] Fix extractors (#8181) Closes #4388 Authored by: c-basalt --- yt_dlp/extractor/neteasemusic.py | 575 +++++++++++++++++-------------- 1 file changed, 312 insertions(+), 263 deletions(-) diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py index 5b7307bc8..68bfcb6ba 100644 --- a/yt_dlp/extractor/neteasemusic.py +++ b/yt_dlp/extractor/neteasemusic.py @@ -2,105 +2,74 @@ import json import re import time -from base64 import b64encode -from binascii import hexlify -from datetime import datetime from hashlib import md5 from random import randint from .common import InfoExtractor from ..aes import aes_ecb_encrypt, pkcs7_padding -from ..compat import compat_urllib_parse_urlencode -from ..networking import Request from ..utils import ( ExtractorError, - bytes_to_intlist, - error_to_compat_str, - float_or_none, int_or_none, - intlist_to_bytes, - try_get, + join_nonempty, + str_or_none, + strftime_or_none, + traverse_obj, + unified_strdate, + url_or_none, + urljoin, + variadic, ) class NetEaseMusicBaseIE(InfoExtractor): _FORMATS = ['bMusic', 'mMusic', 'hMusic'] - _NETEASE_SALT = '3go8&$8*3*3h0k(2)2' _API_BASE = 'http://music.163.com/api/' + _GEO_BYPASS = False - @classmethod - def _encrypt(cls, dfsid): - salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8')) - string_bytes = bytearray(str(dfsid).encode('ascii')) - salt_len = len(salt_bytes) - for i in range(len(string_bytes)): - string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len] - m = md5() - m.update(bytes(string_bytes)) - result = b64encode(m.digest()).decode('ascii') - return result.replace('/', '_').replace('+', '-') + @staticmethod + def kilo_or_none(value): + return int_or_none(value, scale=1000) - def make_player_api_request_data_and_headers(self, song_id, bitrate): - KEY = b'e82ckenh8dichen8' - URL = '/api/song/enhance/player/url' - now = int(time.time() * 1000) - rand = randint(0, 1000) - cookie = { - 'osver': None, - 'deviceId': None, + def _create_eapi_cipher(self, api_path, query_body, cookies): + request_text = json.dumps({**query_body, 'header': cookies}, separators=(',', ':')) + + message = f'nobody{api_path}use{request_text}md5forencrypt'.encode('latin1') + msg_digest = md5(message).hexdigest() + + data = pkcs7_padding(list(str.encode( + 
f'{api_path}-36cd479b6b5-{request_text}-36cd479b6b5-{msg_digest}'))) + encrypted = bytes(aes_ecb_encrypt(data, list(b'e82ckenh8dichen8'))) + return f'params={encrypted.hex().upper()}'.encode() + + def _download_eapi_json(self, path, video_id, query_body, headers={}, **kwargs): + cookies = { + 'osver': 'undefined', + 'deviceId': 'undefined', 'appver': '8.0.0', 'versioncode': '140', - 'mobilename': None, + 'mobilename': 'undefined', 'buildver': '1623435496', 'resolution': '1920x1080', '__csrf': '', 'os': 'pc', - 'channel': None, - 'requestId': '{0}_{1:04}'.format(now, rand), + 'channel': 'undefined', + 'requestId': f'{int(time.time() * 1000)}_{randint(0, 1000):04}', + **traverse_obj(self._get_cookies(self._API_BASE), { + 'MUSIC_U': ('MUSIC_U', {lambda i: i.value}), + }) } - request_text = json.dumps( - {'ids': '[{0}]'.format(song_id), 'br': bitrate, 'header': cookie}, - separators=(',', ':')) - message = 'nobody{0}use{1}md5forencrypt'.format( - URL, request_text).encode('latin1') - msg_digest = md5(message).hexdigest() - - data = '{0}-36cd479b6b5-{1}-36cd479b6b5-{2}'.format( - URL, request_text, msg_digest) - data = pkcs7_padding(bytes_to_intlist(data)) - encrypted = intlist_to_bytes(aes_ecb_encrypt(data, bytes_to_intlist(KEY))) - encrypted_params = hexlify(encrypted).decode('ascii').upper() - - cookie = '; '.join( - ['{0}={1}'.format(k, v if v is not None else 'undefined') - for [k, v] in cookie.items()]) - - headers = { - 'User-Agent': self.extractor.get_param('http_headers')['User-Agent'], - 'Content-Type': 'application/x-www-form-urlencoded', - 'Referer': 'https://music.163.com', - 'Cookie': cookie, - } - return ('params={0}'.format(encrypted_params), headers) + return self._download_json( + urljoin('https://interface3.music.163.com/', f'/eapi{path}'), video_id, + data=self._create_eapi_cipher(f'/api{path}', query_body, cookies), headers={ + 'Referer': 'https://music.163.com', + 'Cookie': '; '.join([f'{k}={v}' for k, v in cookies.items()]), + **headers, + }, **kwargs) def _call_player_api(self, song_id, bitrate): - url = 'https://interface3.music.163.com/eapi/song/enhance/player/url' - data, headers = self.make_player_api_request_data_and_headers(song_id, bitrate) - try: - msg = 'empty result' - result = self._download_json( - url, song_id, data=data.encode('ascii'), headers=headers) - if result: - return result - except ExtractorError as e: - if type(e.cause) in (ValueError, TypeError): - # JSON load failure - raise - except Exception as e: - msg = error_to_compat_str(e) - self.report_warning('%s API call (%s) failed: %s' % ( - song_id, bitrate, msg)) - return {} + return self._download_eapi_json( + '/song/enhance/player/url', song_id, {'ids': f'[{song_id}]', 'br': bitrate}, + note=f'Downloading song URL info: bitrate {bitrate}') def extract_formats(self, info): err = 0 @@ -110,45 +79,50 @@ def extract_formats(self, info): details = info.get(song_format) if not details: continue - bitrate = int_or_none(details.get('bitrate')) or 999000 - data = self._call_player_api(song_id, bitrate) - for song in try_get(data, lambda x: x['data'], list) or []: - song_url = try_get(song, lambda x: x['url']) - if not song_url: - continue + for song in traverse_obj(self._call_player_api(song_id, bitrate), ('data', lambda _, v: url_or_none(v['url']))): + song_url = song['url'] if self._is_valid_url(song_url, info['id'], 'song'): formats.append({ 'url': song_url, - 'ext': details.get('extension'), - 'abr': float_or_none(song.get('br'), scale=1000), 'format_id': song_format, - 'filesize': 
int_or_none(song.get('size')), - 'asr': int_or_none(details.get('sr')), + 'asr': traverse_obj(details, ('sr', {int_or_none})), + **traverse_obj(song, { + 'ext': ('type', {str}), + 'abr': ('br', {self.kilo_or_none}), + 'filesize': ('size', {int_or_none}), + }), }) elif err == 0: - err = try_get(song, lambda x: x['code'], int) + err = traverse_obj(song, ('code', {int})) or 0 if not formats: - msg = 'No media links found' if err != 0 and (err < 200 or err >= 400): - raise ExtractorError( - '%s (site code %d)' % (msg, err, ), expected=True) + raise ExtractorError(f'No media links found (site code {err})', expected=True) else: self.raise_geo_restricted( - msg + ': probably this video is not available from your location due to geo restriction.', - countries=['CN']) - + 'No media links found: probably due to geo restriction.', countries=['CN']) return formats - @classmethod - def convert_milliseconds(cls, ms): - return int(round(ms / 1000.0)) - def query_api(self, endpoint, video_id, note): - req = Request('%s%s' % (self._API_BASE, endpoint)) - req.headers['Referer'] = self._API_BASE - return self._download_json(req, video_id, note) + result = self._download_json( + f'{self._API_BASE}{endpoint}', video_id, note, headers={'Referer': self._API_BASE}) + code = traverse_obj(result, ('code', {int})) + message = traverse_obj(result, ('message', {str})) or '' + if code == -462: + self.raise_login_required(f'Login required to download: {message}') + elif code != 200: + raise ExtractorError(f'Failed to get meta info: {code} {message}') + return result + + def _get_entries(self, songs_data, entry_keys=None, id_key='id', name_key='name'): + for song in traverse_obj(songs_data, ( + *variadic(entry_keys, (str, bytes, dict, set)), + lambda _, v: int_or_none(v[id_key]) is not None)): + song_id = str(song[id_key]) + yield self.url_result( + f'http://music.163.com/#/song?id={song_id}', NetEaseMusicIE, + song_id, traverse_obj(song, (name_key, {str}))) class NetEaseMusicIE(NetEaseMusicBaseIE): @@ -156,16 +130,18 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): IE_DESC = '网易云音乐' _VALID_URL = r'https?://(y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P[0-9]+)' _TESTS = [{ - 'url': 'http://music.163.com/#/song?id=32102397', - 'md5': '3e909614ce09b1ccef4a3eb205441190', + 'url': 'https://music.163.com/#/song?id=548648087', 'info_dict': { - 'id': '32102397', + 'id': '548648087', 'ext': 'mp3', - 'title': 'Bad Blood', - 'creator': 'Taylor Swift / Kendrick Lamar', - 'upload_date': '20150516', - 'timestamp': 1431792000, - 'description': 'md5:25fc5f27e47aad975aa6d36382c7833c', + 'title': '戒烟 (Live)', + 'creator': '李荣浩 / 朱正廷 / 陈立农 / 尤长靖 / ONER灵超 / ONER木子洋 / 杨非同 / 陆定昊', + 'timestamp': 1522944000, + 'upload_date': '20180405', + 'description': 'md5:3650af9ee22c87e8637cb2dde22a765c', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + "duration": 256, + 'thumbnail': r're:^http.*\.jpg', }, }, { 'note': 'No lyrics.', @@ -176,21 +152,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'title': 'Opus 28', 'creator': 'Dustin O\'Halloran', 'upload_date': '20080211', - 'description': 'md5:f12945b0f6e0365e3b73c5032e1b0ff4', 'timestamp': 1202745600, - }, - }, { - 'note': 'Has translated name.', - 'url': 'http://music.163.com/#/song?id=22735043', - 'info_dict': { - 'id': '22735043', - 'ext': 'mp3', - 'title': '소원을 말해봐 (Genie)', - 'creator': '少女时代', - 'description': 'md5:79d99cc560e4ca97e0c4d86800ee4184', - 'upload_date': '20100127', - 'timestamp': 1264608000, - 'alt_title': '说出愿望吧(Genie)', + 'duration': 263, + 'thumbnail': r're:^http.*\.jpg', }, }, { 
'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846', @@ -203,59 +167,99 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'upload_date': '19911130', 'timestamp': 691516800, 'description': 'md5:1ba2f911a2b0aa398479f595224f2141', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 268, + 'alt_title': '伴唱:现代人乐队 合唱:总政歌舞团', + 'thumbnail': r're:^http.*\.jpg', }, + }, { + 'url': 'http://music.163.com/#/song?id=32102397', + 'md5': '3e909614ce09b1ccef4a3eb205441190', + 'info_dict': { + 'id': '32102397', + 'ext': 'mp3', + 'title': 'Bad Blood', + 'creator': 'Taylor Swift / Kendrick Lamar', + 'upload_date': '20150516', + 'timestamp': 1431792000, + 'description': 'md5:21535156efb73d6d1c355f95616e285a', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 199, + 'thumbnail': r're:^http.*\.jpg', + }, + 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'Has translated name.', + 'url': 'http://music.163.com/#/song?id=22735043', + 'info_dict': { + 'id': '22735043', + 'ext': 'mp3', + 'title': '소원을 말해봐 (Genie)', + 'creator': '少女时代', + 'upload_date': '20100127', + 'timestamp': 1264608000, + 'description': 'md5:03d1ffebec3139aa4bafe302369269c5', + 'subtitles': {'lyrics': [{'ext': 'lrc'}]}, + 'duration': 229, + 'alt_title': '说出愿望吧(Genie)', + 'thumbnail': r're:^http.*\.jpg', + }, + 'skip': 'Blocked outside Mainland China', }] def _process_lyrics(self, lyrics_info): - original = lyrics_info.get('lrc', {}).get('lyric') - translated = lyrics_info.get('tlyric', {}).get('lyric') + original = traverse_obj(lyrics_info, ('lrc', 'lyric', {str})) + translated = traverse_obj(lyrics_info, ('tlyric', 'lyric', {str})) + + if not original or original == '[99:00.00]纯音乐,请欣赏\n': + return None if not translated: - return original + return { + 'lyrics': [{'data': original, 'ext': 'lrc'}], + } lyrics_expr = r'(\[[0-9]{2}:[0-9]{2}\.[0-9]{2,}\])([^\n]+)' original_ts_texts = re.findall(lyrics_expr, original) - translation_ts_dict = dict( - (time_stamp, text) for time_stamp, text in re.findall(lyrics_expr, translated) - ) - lyrics = '\n'.join([ - '%s%s / %s' % (time_stamp, text, translation_ts_dict.get(time_stamp, '')) - for time_stamp, text in original_ts_texts - ]) - return lyrics + translation_ts_dict = dict(re.findall(lyrics_expr, translated)) + + merged = '\n'.join( + join_nonempty(f'{timestamp}{text}', translation_ts_dict.get(timestamp, ''), delim=' / ') + for timestamp, text in original_ts_texts) + + return { + 'lyrics_merged': [{'data': merged, 'ext': 'lrc'}], + 'lyrics': [{'data': original, 'ext': 'lrc'}], + 'lyrics_translated': [{'data': translated, 'ext': 'lrc'}], + } def _real_extract(self, url): song_id = self._match_id(url) - params = { - 'id': song_id, - 'ids': '[%s]' % song_id - } info = self.query_api( - 'song/detail?' 
+ compat_urllib_parse_urlencode(params), - song_id, 'Downloading song info')['songs'][0] + f'song/detail?id={song_id}&ids=%5B{song_id}%5D', song_id, 'Downloading song info')['songs'][0] formats = self.extract_formats(info) - lyrics_info = self.query_api( - 'song/lyric?id=%s&lv=-1&tv=-1' % song_id, - song_id, 'Downloading lyrics data') - lyrics = self._process_lyrics(lyrics_info) - - alt_title = None - if info.get('transNames'): - alt_title = '/'.join(info.get('transNames')) + lyrics = self._process_lyrics(self.query_api( + f'song/lyric?id={song_id}&lv=-1&tv=-1', song_id, 'Downloading lyrics data')) + lyric_data = { + 'description': traverse_obj(lyrics, (('lyrics_merged', 'lyrics'), 0, 'data'), get_all=False), + 'subtitles': lyrics, + } if lyrics else {} return { 'id': song_id, - 'title': info['name'], - 'alt_title': alt_title, - 'creator': ' / '.join([artist['name'] for artist in info.get('artists', [])]), - 'timestamp': self.convert_milliseconds(info.get('album', {}).get('publishTime')), - 'thumbnail': info.get('album', {}).get('picUrl'), - 'duration': self.convert_milliseconds(info.get('duration', 0)), - 'description': lyrics, 'formats': formats, + 'alt_title': '/'.join(traverse_obj(info, (('transNames', 'alias'), ...))) or None, + 'creator': ' / '.join(traverse_obj(info, ('artists', ..., 'name'))) or None, + **lyric_data, + **traverse_obj(info, { + 'title': ('name', {str}), + 'timestamp': ('album', 'publishTime', {self.kilo_or_none}), + 'thumbnail': ('album', 'picUrl', {url_or_none}), + 'duration': ('duration', {self.kilo_or_none}), + }), } @@ -263,31 +267,44 @@ class NetEaseMusicAlbumIE(NetEaseMusicBaseIE): IE_NAME = 'netease:album' IE_DESC = '网易云音乐 - 专辑' _VALID_URL = r'https?://music\.163\.com/(#/)?album\?id=(?P[0-9]+)' - _TEST = { + _TESTS = [{ + 'url': 'https://music.163.com/#/album?id=133153666', + 'info_dict': { + 'id': '133153666', + 'title': '桃几的翻唱', + 'upload_date': '20210913', + 'description': '桃几2021年翻唱合集', + 'thumbnail': r're:^http.*\.jpg', + }, + 'playlist_mincount': 13, + }, { 'url': 'http://music.163.com/#/album?id=220780', 'info_dict': { 'id': '220780', - 'title': 'B\'day', + 'title': 'B\'Day', + 'upload_date': '20060904', + 'description': 'md5:71a74e1d8f392d88cf1bbe48879ad0b0', + 'thumbnail': r're:^http.*\.jpg', }, 'playlist_count': 23, - 'skip': 'Blocked outside Mainland China', - } + }] def _real_extract(self, url): album_id = self._match_id(url) + webpage = self._download_webpage(f'https://music.163.com/album?id={album_id}', album_id) - info = self.query_api( - 'album/%s?id=%s' % (album_id, album_id), - album_id, 'Downloading album data')['album'] - - name = info['name'] - desc = info.get('description') - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['songs'] - ] - return self.playlist_result(entries, album_id, name, desc) + songs = self._search_json( + r']+\bid="song-list-pre-data"[^>]*>', webpage, 'metainfo', album_id, + end_pattern=r'', contains_pattern=r'\[(?s:.+)\]') + metainfo = { + 'title': self._og_search_property('title', webpage, 'title', fatal=False), + 'description': self._html_search_regex( + (rf']+\bid="album-desc-{suffix}"[^>]*>(.*?)' for suffix in ('more', 'dot')), + webpage, 'description', flags=re.S, fatal=False), + 'thumbnail': self._og_search_property('image', webpage, 'thumbnail', fatal=False), + 'upload_date': unified_strdate(self._html_search_meta('music:release_date', webpage, 'date', fatal=False)), + } + return self.playlist_result(self._get_entries(songs), 
album_id, **metainfo) class NetEaseMusicSingerIE(NetEaseMusicBaseIE): @@ -299,10 +316,9 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): 'url': 'http://music.163.com/#/artist?id=10559', 'info_dict': { 'id': '10559', - 'title': '张惠妹 - aMEI;阿密特', + 'title': '张惠妹 - aMEI;阿妹;阿密特', }, 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'Singer has translated name.', 'url': 'http://music.163.com/#/artist?id=124098', @@ -311,28 +327,28 @@ class NetEaseMusicSingerIE(NetEaseMusicBaseIE): 'title': '李昇基 - 이승기', }, 'playlist_count': 50, - 'skip': 'Blocked outside Mainland China', + }, { + 'note': 'Singer with both translated and alias', + 'url': 'https://music.163.com/#/artist?id=159692', + 'info_dict': { + 'id': '159692', + 'title': '初音ミク - 初音未来;Hatsune Miku', + }, + 'playlist_count': 50, }] def _real_extract(self, url): singer_id = self._match_id(url) info = self.query_api( - 'artist/%s?id=%s' % (singer_id, singer_id), - singer_id, 'Downloading singer data') + f'artist/{singer_id}?id={singer_id}', singer_id, note='Downloading singer data') - name = info['artist']['name'] - if info['artist']['trans']: - name = '%s - %s' % (name, info['artist']['trans']) - if info['artist']['alias']: - name = '%s - %s' % (name, ';'.join(info['artist']['alias'])) + name = join_nonempty( + traverse_obj(info, ('artist', 'name', {str})), + join_nonempty(*traverse_obj(info, ('artist', ('trans', ('alias', ...)), {str})), delim=';'), + delim=' - ') - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['hotSongs'] - ] - return self.playlist_result(entries, singer_id, name) + return self.playlist_result(self._get_entries(info, 'hotSongs'), singer_id, name) class NetEaseMusicListIE(NetEaseMusicBaseIE): @@ -344,10 +360,28 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): 'info_dict': { 'id': '79177352', 'title': 'Billboard 2007 Top 100', - 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022' + 'description': 'md5:12fd0819cab2965b9583ace0f8b7b022', + 'tags': ['欧美'], + 'uploader': '浑然破灭', + 'uploader_id': '67549805', + 'timestamp': int, + 'upload_date': r're:\d{8}', }, - 'playlist_count': 99, - 'skip': 'Blocked outside Mainland China', + 'playlist_mincount': 95, + }, { + 'note': 'Toplist/Charts sample', + 'url': 'https://music.163.com/#/discover/toplist?id=60198', + 'info_dict': { + 'id': '60198', + 'title': 're:美国Billboard榜 [0-9]{4}-[0-9]{2}-[0-9]{2}', + 'description': '美国Billboard排行榜', + 'tags': ['流行', '欧美', '榜单'], + 'uploader': 'Billboard公告牌', + 'uploader_id': '48171', + 'timestamp': int, + 'upload_date': r're:\d{8}', + }, + 'playlist_count': 100, }, { 'note': 'Toplist/Charts sample', 'url': 'http://music.163.com/#/discover/toplist?id=3733003', @@ -363,64 +397,86 @@ class NetEaseMusicListIE(NetEaseMusicBaseIE): def _real_extract(self, url): list_id = self._match_id(url) - info = self.query_api( - 'playlist/detail?id=%s&lv=-1&tv=-1' % list_id, - list_id, 'Downloading playlist data')['result'] + info = self._download_eapi_json( + '/v3/playlist/detail', list_id, + {'id': list_id, 't': '-1', 'n': '500', 's': '0'}, + note="Downloading playlist info") - name = info['name'] - desc = info.get('description') + metainfo = traverse_obj(info, ('playlist', { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'tags': ('tags', ..., {str}), + 'uploader': ('creator', 'nickname', {str}), + 'uploader_id': ('creator', 'userId', {str_or_none}), + 'timestamp': ('updateTime', {self.kilo_or_none}), + })) + if traverse_obj(info, 
('playlist', 'specialType')) == 10: + metainfo['title'] = f'{metainfo.get("title")} {strftime_or_none(metainfo.get("timestamp"), "%Y-%m-%d")}' - if info.get('specialType') == 10: # is a chart/toplist - datestamp = datetime.fromtimestamp( - self.convert_milliseconds(info['updateTime'])).strftime('%Y-%m-%d') - name = '%s %s' % (name, datestamp) - - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song['id'], - 'NetEaseMusic', song['id']) - for song in info['tracks'] - ] - return self.playlist_result(entries, list_id, name, desc) + return self.playlist_result(self._get_entries(info, ('playlist', 'tracks')), list_id, **metainfo) class NetEaseMusicMvIE(NetEaseMusicBaseIE): IE_NAME = 'netease:mv' IE_DESC = '网易云音乐 - MV' _VALID_URL = r'https?://music\.163\.com/(#/)?mv\?id=(?P<id>[0-9]+)' - _TEST = { + _TESTS = [{ + 'url': 'https://music.163.com/#/mv?id=10958064', + 'info_dict': { + 'id': '10958064', + 'ext': 'mp4', + 'title': '交换余生', + 'description': 'md5:e845872cff28820642a2b02eda428fea', + 'creator': '林俊杰', + 'upload_date': '20200916', + 'thumbnail': r're:http.*\.jpg', + 'duration': 364, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + }, { 'url': 'http://music.163.com/#/mv?id=415350', 'info_dict': { 'id': '415350', 'ext': 'mp4', 'title': '이럴거면 그러지말지', 'description': '白雅言自作曲唱甜蜜爱情', - 'creator': '白雅言', + 'creator': '白娥娟', 'upload_date': '20150520', + 'thumbnail': r're:http.*\.jpg', + 'duration': 216, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, - 'skip': 'Blocked outside Mainland China', - } + }] def _real_extract(self, url): mv_id = self._match_id(url) info = self.query_api( - 'mv/detail?id=%s&type=mp4' % mv_id, - mv_id, 'Downloading mv info')['data'] + f'mv/detail?id={mv_id}&type=mp4', mv_id, 'Downloading mv info')['data'] formats = [ - {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)} + {'url': mv_url, 'ext': 'mp4', 'format_id': f'{brs}p', 'height': int_or_none(brs)} for brs, mv_url in info['brs'].items() ] return { 'id': mv_id, - 'title': info['name'], - 'description': info.get('desc') or info.get('briefDesc'), - 'creator': info['artistName'], - 'upload_date': info['publishTime'].replace('-', ''), 'formats': formats, - 'thumbnail': info.get('cover'), - 'duration': self.convert_milliseconds(info.get('duration', 0)), + **traverse_obj(info, { + 'title': ('name', {str}), + 'description': (('desc', 'briefDesc'), {str}, {lambda x: x or None}), + 'creator': ('artistName', {str}), + 'upload_date': ('publishTime', {unified_strdate}), + 'thumbnail': ('cover', {url_or_none}), + 'duration': ('duration', {self.kilo_or_none}), + 'view_count': ('playCount', {int_or_none}), + 'like_count': ('likeCount', {int_or_none}), + 'comment_count': ('commentCount', {int_or_none}), + }, get_all=False), } @@ -431,75 +487,74 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): _TESTS = [{ 'url': 'http://music.163.com/#/program?id=10109055', 'info_dict': { - 'id': '10109055', + 'id': '32593346', 'ext': 'mp3', 'title': '不丹足球背后的故事', 'description': '喜马拉雅人的足球梦 ...', 'creator': '大话西藏', - 'timestamp': 1434179342, + 'timestamp': 1434179287, 'upload_date': '20150613', + 'thumbnail': r're:http.*\.jpg', 'duration': 900, }, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'This program has accompanying songs.', 'url': 'http://music.163.com/#/program?id=10141022', 'info_dict': { 'id': '10141022', - 'title': '25岁,你是自在如风的少年<27°C>', + 'title': '滚滚电台的有声节目', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', + 'creator': '滚滚电台ORZ', +
'timestamp': 1434450733, + 'upload_date': '20150616', + 'thumbnail': r're:http.*\.jpg', }, 'playlist_count': 4, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'This program has accompanying songs.', 'url': 'http://music.163.com/#/program?id=10141022', 'info_dict': { - 'id': '10141022', + 'id': '32647209', 'ext': 'mp3', - 'title': '25岁,你是自在如风的少年<27°C>', + 'title': '滚滚电台的有声节目', 'description': 'md5:8d594db46cc3e6509107ede70a4aaa3b', - 'timestamp': 1434450841, + 'creator': '滚滚电台ORZ', + 'timestamp': 1434450733, 'upload_date': '20150616', + 'thumbnail': r're:http.*\.jpg', + 'duration': 1104, }, 'params': { 'noplaylist': True }, - 'skip': 'Blocked outside Mainland China', }] def _real_extract(self, url): program_id = self._match_id(url) info = self.query_api( - 'dj/program/detail?id=%s' % program_id, - program_id, 'Downloading program info')['program'] + f'dj/program/detail?id={program_id}', program_id, note='Downloading program info')['program'] - name = info['name'] - description = info['description'] + metainfo = traverse_obj(info, { + 'title': ('name', {str}), + 'description': ('description', {str}), + 'creator': ('dj', 'brand', {str}), + 'thumbnail': ('coverUrl', {url_or_none}), + 'timestamp': ('createTime', {self.kilo_or_none}), + }) if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']): formats = self.extract_formats(info['mainSong']) return { - 'id': info['mainSong']['id'], - 'title': name, - 'description': description, - 'creator': info['dj']['brand'], - 'timestamp': self.convert_milliseconds(info['createTime']), - 'thumbnail': info['coverUrl'], - 'duration': self.convert_milliseconds(info.get('duration', 0)), + 'id': str(info['mainSong']['id']), 'formats': formats, + 'duration': traverse_obj(info, ('mainSong', 'duration', {self.kilo_or_none})), + **metainfo, } - song_ids = [info['mainSong']['id']] - song_ids.extend([song['id'] for song in info['songs']]) - entries = [ - self.url_result('http://music.163.com/#/song?id=%s' % song_id, - 'NetEaseMusic', song_id) - for song_id in song_ids - ] - return self.playlist_result(entries, program_id, name, description) + songs = traverse_obj(info, (('mainSong', ('songs', ...)),)) + return self.playlist_result(self._get_entries(songs), program_id, **metainfo) class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): @@ -511,38 +566,32 @@ class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): 'info_dict': { 'id': '42', 'title': '声音蔓延', - 'description': 'md5:766220985cbd16fdd552f64c578a6b15' + 'description': 'md5:c7381ebd7989f9f367668a5aee7d5f08' }, 'playlist_mincount': 40, - 'skip': 'Blocked outside Mainland China', } _PAGE_SIZE = 1000 def _real_extract(self, url): dj_id = self._match_id(url) - name = None - desc = None + metainfo = {} entries = [] for offset in itertools.count(start=0, step=self._PAGE_SIZE): info = self.query_api( - 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' - % (self._PAGE_SIZE, dj_id, offset), - dj_id, 'Downloading dj programs - %d' % offset) + f'dj/program/byradio?asc=false&limit={self._PAGE_SIZE}&radioId={dj_id}&offset={offset}', + dj_id, note=f'Downloading dj programs - {offset}') - entries.extend([ - self.url_result( - 'http://music.163.com/#/program?id=%s' % program['id'], - 'NetEaseMusicProgram', program['id']) - for program in info['programs'] - ]) - - if name is None: - radio = info['programs'][0]['radio'] - name = radio['name'] - desc = radio['desc'] + entries.extend(self.url_result( + f'http://music.163.com/#/program?id={program["id"]}', NetEaseMusicProgramIE, + program['id'], 
program.get('name')) for program in info['programs']) + if not metainfo: + metainfo = traverse_obj(info, ('programs', 0, 'radio', { + 'title': ('name', {str}), + 'description': ('desc', {str}), + })) if not info['more']: break - return self.playlist_result(entries, dj_id, name, desc) + return self.playlist_result(entries, dj_id, **metainfo) From a9efb4b8d74f3583450ffda0ee57259a47d39c70 Mon Sep 17 00:00:00 2001 From: xofe <22776566+xofe@users.noreply.github.com> Date: Fri, 6 Oct 2023 22:35:11 +0000 Subject: [PATCH 203/218] [ie/abc.net.au:iview] Improve `episode` extraction (#8201) Authored by: xofe --- yt_dlp/extractor/abc.py | 90 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/abc.py b/yt_dlp/extractor/abc.py index d2cf5f7c5..9d527246a 100644 --- a/yt_dlp/extractor/abc.py +++ b/yt_dlp/extractor/abc.py @@ -181,18 +181,102 @@ class ABCIViewIE(InfoExtractor): _GEO_COUNTRIES = ['AU'] _TESTS = [{ + 'url': 'https://iview.abc.net.au/show/utopia/series/1/video/CO1211V001S00', + 'md5': '52a942bfd7a0b79a6bfe9b4ce6c9d0ed', + 'info_dict': { + 'id': 'CO1211V001S00', + 'ext': 'mp4', + 'title': 'Series 1 Ep 1 Wood For The Trees', + 'series': 'Utopia', + 'description': 'md5:0cfb2c183c1b952d1548fd65c8a95c00', + 'upload_date': '20230726', + 'uploader_id': 'abc1', + 'series_id': 'CO1211V', + 'episode_id': 'CO1211V001S00', + 'season_number': 1, + 'season': 'Season 1', + 'episode_number': 1, + 'episode': 'Wood For The Trees', + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/co/CO1211V001S00_5ad8353f4df09_1280.jpg', + 'timestamp': 1690403700, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'note': 'No episode name', 'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00', 'md5': '67715ce3c78426b11ba167d875ac6abf', 'info_dict': { 'id': 'LE1927H001S00', 'ext': 'mp4', - 'title': "Series 11 Ep 1", - 'series': "Gruen", + 'title': 'Series 11 Ep 1', + 'series': 'Gruen', 'description': 'md5:52cc744ad35045baf6aded2ce7287f67', 'upload_date': '20190925', 'uploader_id': 'abc1', + 'series_id': 'LE1927H', + 'episode_id': 'LE1927H001S00', + 'season_number': 11, + 'season': 'Season 11', + 'episode_number': 1, + 'episode': 'Episode 1', + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/le/LE1927H001S00_5d954fbd79e25_1280.jpg', 'timestamp': 1569445289, }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'params': { + 'skip_download': True, + }, + }, { + 'note': 'No episode number', + 'url': 'https://iview.abc.net.au/show/four-corners/series/2022/video/NC2203H039S00', + 'md5': '77cb7d8434440e3b28fbebe331c2456a', + 'info_dict': { + 'id': 'NC2203H039S00', + 'ext': 'mp4', + 'title': 'Series 2022 Locking Up Kids', + 'series': 'Four Corners', + 'description': 'md5:54829ca108846d1a70e1fcce2853e720', + 'upload_date': '20221114', + 'uploader_id': 'abc1', + 'series_id': 'NC2203H', + 'episode_id': 'NC2203H039S00', + 'season_number': 2022, + 'season': 'Season 2022', + 'episode_number': None, + 'episode': 'Locking Up Kids', + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/nc/NC2203H039S00_636d8a0944a22_1920.jpg', + 'timestamp': 1668460497, + + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], + 'params': { + 'skip_download': True, + }, + }, { + 'note': 'No episode name or number', + 'url': 'https://iview.abc.net.au/show/landline/series/2021/video/RF2004Q043S00', + 'md5': '2e17dec06b13cc81dc119d2565289396', + 'info_dict': { + 'id': 'RF2004Q043S00', + 'ext': 'mp4', + 
'title': 'Series 2021', + 'series': 'Landline', + 'description': 'md5:c9f30d9c0c914a7fd23842f6240be014', + 'upload_date': '20211205', + 'uploader_id': 'abc1', + 'series_id': 'RF2004Q', + 'episode_id': 'RF2004Q043S00', + 'season_number': 2021, + 'season': 'Season 2021', + 'episode_number': None, + 'episode': None, + 'thumbnail': 'https://cdn.iview.abc.net.au/thumbs/i/rf/RF2004Q043S00_61a950639dbc0_1920.jpg', + 'timestamp': 1638710705, + + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], 'params': { 'skip_download': True, }, @@ -254,6 +338,8 @@ def tokenize_url(url, token): 'episode_number': int_or_none(self._search_regex( r'\bEp\s+(\d+)\b', title, 'episode number', default=None)), 'episode_id': house_number, + 'episode': self._search_regex( + r'^(?:Series\s+\d+)?\s*(?:Ep\s+\d+)?\s*(.*)$', title, 'episode', default='') or None, 'uploader_id': video_params.get('channel'), 'formats': formats, 'subtitles': subtitles, From 48cceec1ddb8649b5e771df8df79eb9c39c82b90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Fri, 6 Oct 2023 19:38:26 -0300 Subject: [PATCH 204/218] [ie/lbry] Add playlist support (#8213) Closes #5982, Closes #8204 Authored by: drzraf, bashonly, Grub4K --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/lbry.py | 184 ++++++++++++++++++++------------ 2 files changed, 116 insertions(+), 69 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 908abb8ac..ef6123e8a 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -951,6 +951,7 @@ from .lbry import ( LBRYIE, LBRYChannelIE, + LBRYPlaylistIE, ) from .lci import LCIIE from .lcp import ( diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 9a9f9256f..ccce300b5 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -22,10 +22,11 @@ class LBRYBaseIE(InfoExtractor): - _BASE_URL_REGEX = r'(?:https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/|lbry://)' + _BASE_URL_REGEX = r'(?x)(?:https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/|lbry://)' _CLAIM_ID_REGEX = r'[0-9a-f]{1,40}' - _OPT_CLAIM_ID = '[^:/?#&]+(?:[:#]%s)?' % _CLAIM_ID_REGEX + _OPT_CLAIM_ID = '[^$@:/?#&]+(?:[:#]%s)?' 
% _CLAIM_ID_REGEX _SUPPORTED_STREAM_TYPES = ['video', 'audio'] + _PAGE_SIZE = 50 def _call_api_proxy(self, method, display_id, params, resource): headers = {'Content-Type': 'application/json-rpc'} @@ -77,10 +78,70 @@ def _parse_stream(self, stream, url): return info + def _fetch_page(self, display_id, url, params, page): + page += 1 + page_params = { + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + **params, + } + result = self._call_api_proxy( + 'claim_search', display_id, page_params, f'page {page}') + for item in traverse_obj(result, ('items', lambda _, v: v['name'] and v['claim_id'])): + yield { + **self._parse_stream(item, url), + '_type': 'url', + 'id': item['claim_id'], + 'url': self._permanent_url(url, item['name'], item['claim_id']), + } + + def _playlist_entries(self, url, display_id, claim_param, metadata): + qs = parse_qs(url) + content = qs.get('content', [None])[0] + params = { + 'fee_amount': qs.get('fee_amount', ['>=0'])[0], + 'order_by': { + 'new': ['release_time'], + 'top': ['effective_amount'], + 'trending': ['trending_group', 'trending_mixed'], + }[qs.get('order', ['new'])[0]], + 'claim_type': 'stream', + 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, + **claim_param, + } + duration = qs.get('duration', [None])[0] + if duration: + params['duration'] = { + 'long': '>=1200', + 'short': '<=240', + }[duration] + language = qs.get('language', ['all'])[0] + if language != 'all': + languages = [language] + if language == 'en': + languages.append('none') + params['any_languages'] = languages + + entries = OnDemandPagedList( + functools.partial(self._fetch_page, display_id, url, params), + self._PAGE_SIZE) + + return self.playlist_result( + entries, display_id, **traverse_obj(metadata, ('value', { + 'title': 'title', + 'description': 'description', + }))) + class LBRYIE(LBRYBaseIE): IE_NAME = 'lbry' - _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>\$/[^/]+/[^/]+/{1}|@{0}/{0}|(?!@){0})'.format(LBRYBaseIE._OPT_CLAIM_ID, LBRYBaseIE._CLAIM_ID_REGEX) + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf''' + (?:\$/(?:download|embed)/)?
+ (?P<id> + [^$@:/?#]+/{LBRYBaseIE._CLAIM_ID_REGEX} + |(?:@{LBRYBaseIE._OPT_CLAIM_ID}/)?{LBRYBaseIE._OPT_CLAIM_ID} + )''' _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', @@ -149,7 +210,7 @@ class LBRYIE(LBRYBaseIE): 'channel': 'Gardening In Canada', 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', - 'formats': 'mincount:3', + 'formats': 'mincount:3', # FIXME 'thumbnail': 'https://thumbnails.lbry.com/AgHSc_HzrrE', 'license': 'Copyrighted (contact publisher)', } @@ -184,12 +245,12 @@ class LBRYIE(LBRYBaseIE): 'id': '41fbfe805eb73c8d3012c0c49faa0f563274f634', 'ext': 'mp4', 'title': 'Biotechnological Invasion of Skin (April 2023)', - 'description': 'md5:709a2f4c07bd8891cda3a7cc2d6fcf5c', + 'description': 'md5:fe28689db2cb7ba3436d819ac3ffc378', 'channel': 'Wicked Truths', 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', 'channel_url': 'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', - 'timestamp': 1685790036, - 'upload_date': '20230603', + 'timestamp': 1695114347, + 'upload_date': '20230919', 'release_timestamp': 1685617473, 'release_date': '20230601', 'duration': 1063, @@ -229,10 +290,10 @@ class LBRYIE(LBRYBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - if display_id.startswith('$/'): - display_id = display_id.split('/', 2)[-1].replace('/', ':') - else: + if display_id.startswith('@'): display_id = display_id.replace(':', '#') + else: + display_id = display_id.replace('/', ':') display_id = urllib.parse.unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 'stream') @@ -299,7 +360,7 @@ def _real_extract(self, url): class LBRYChannelIE(LBRYBaseIE): IE_NAME = 'lbry:channel' - _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'(?P<id>@%s)/?(?:[?&]|$)' % LBRYBaseIE._OPT_CLAIM_ID + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + rf'(?P<id>@{LBRYBaseIE._OPT_CLAIM_ID})/?(?:[?&]|$)' _TESTS = [{ 'url': 'https://lbry.tv/@LBRYFoundation:0', 'info_dict': { @@ -315,65 +376,50 @@ class LBRYChannelIE(LBRYBaseIE): 'url': 'lbry://@lbry#3f', 'only_matching': True, }] - _PAGE_SIZE = 50 - - def _fetch_page(self, claim_id, url, params, page): - page += 1 - page_params = { - 'channel_ids': [claim_id], - 'claim_type': 'stream', - 'no_totals': True, - 'page': page, - 'page_size': self._PAGE_SIZE, - } - page_params.update(params) - result = self._call_api_proxy( - 'claim_search', claim_id, page_params, 'page %d' % page) - for item in (result.get('items') or []): - stream_claim_name = item.get('name') - stream_claim_id = item.get('claim_id') - if not (stream_claim_name and stream_claim_id): - continue - - yield { - **self._parse_stream(item, url), - '_type': 'url', - 'id': stream_claim_id, - 'url': self._permanent_url(url, stream_claim_name, stream_claim_id), - } def _real_extract(self, url): display_id = self._match_id(url).replace(':', '#') - result = self._resolve_url( - 'lbry://' + display_id, display_id, 'channel') + result = self._resolve_url(f'lbry://{display_id}', display_id, 'channel') claim_id = result['claim_id'] - qs = parse_qs(url) - content = qs.get('content', [None])[0] - params = { - 'fee_amount': qs.get('fee_amount', ['>=0'])[0], - 'order_by': { - 'new': ['release_time'], - 'top': ['effective_amount'], - 'trending': ['trending_group', 'trending_mixed'], - }[qs.get('order', ['new'])[0]], - 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, - } - duration
= qs.get('duration', [None])[0] - if duration: - params['duration'] = { - 'long': '>=1200', - 'short': '<=240', - }[duration] - language = qs.get('language', ['all'])[0] - if language != 'all': - languages = [language] - if language == 'en': - languages.append('none') - params['any_languages'] = languages - entries = OnDemandPagedList( - functools.partial(self._fetch_page, claim_id, url, params), - self._PAGE_SIZE) - result_value = result.get('value') or {} - return self.playlist_result( - entries, claim_id, result_value.get('title'), - result_value.get('description')) + + return self._playlist_entries(url, claim_id, {'channel_ids': [claim_id]}, result) + + +class LBRYPlaylistIE(LBRYBaseIE): + IE_NAME = 'lbry:playlist' + _VALID_URL = LBRYBaseIE._BASE_URL_REGEX + r'\$/(?:play)?list/(?P<id>[0-9a-f-]+)' + _TESTS = [{ + 'url': 'https://odysee.com/$/playlist/ffef782f27486f0ac138bde8777f72ebdd0548c2', + 'info_dict': { + 'id': 'ffef782f27486f0ac138bde8777f72ebdd0548c2', + 'title': 'Théâtre Classique', + 'description': 'Théâtre Classique', + }, + 'playlist_mincount': 4, + }, { + 'url': 'https://odysee.com/$/list/9c6658b3dd21e4f2a0602d523a13150e2b48b770', + 'info_dict': { + 'id': '9c6658b3dd21e4f2a0602d523a13150e2b48b770', + 'title': 'Social Media Exposed', + 'description': 'md5:98af97317aacd5b85d595775ea37d80e', + }, + 'playlist_mincount': 34, + }, { + 'url': 'https://odysee.com/$/playlist/938fb11d-215f-4d1c-ad64-723954df2184', + 'info_dict': { + 'id': '938fb11d-215f-4d1c-ad64-723954df2184', + }, + 'playlist_mincount': 1000, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + result = traverse_obj(self._call_api_proxy('claim_search', display_id, { + 'claim_ids': [display_id], + 'no_totals': True, + 'page': 1, + 'page_size': self._PAGE_SIZE, + }, 'playlist'), ('items', 0)) + claim_param = {'claim_ids': traverse_obj(result, ('value', 'claims', ..., {str}))} + + return self._playlist_entries(url, display_id, claim_param, result) From fbcc299bd8a19cf8b3c8805d6c268a9110230973 Mon Sep 17 00:00:00 2001 From: Umar Getagazov Date: Sat, 7 Oct 2023 01:45:46 +0300 Subject: [PATCH 205/218] [ie/substack] Fix embed extraction (#8218) Authored by: handlerug --- yt_dlp/extractor/substack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/substack.py b/yt_dlp/extractor/substack.py index 3782ceed1..5835a5a8d 100644 --- a/yt_dlp/extractor/substack.py +++ b/yt_dlp/extractor/substack.py @@ -50,7 +50,7 @@ def _extract_embed_urls(cls, url, webpage): if not re.search(r'<script[^>]+src=["\']https://substackcdn.com/[^"\']+\.js', webpage): return - mobj = re.search(r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P<subdomain>[^"]+)', webpage) + mobj = re.search(r'{[^}]*\\?["\']subdomain\\?["\']\s*:\s*\\?["\'](?P<subdomain>[^\\"\']+)', webpage) if mobj: parsed = urllib.parse.urlparse(url) yield parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl() raise
cls.StopExtraction() - def _extract_video_formats(self, video_id, username): + def _extract_video_formats(self, video_id, url): formats, subtitles = [], {} for video_format in ('hls', 'mp4'): - video_url = f'https://{username}.substack.com/api/v1/video/upload/{video_id}/src?type={video_format}' + video_url = urllib.parse.urljoin(url, f'/api/v1/video/upload/{video_id}/src?type={video_format}') if video_format == 'hls': fmts, subs = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', fatal=False) @@ -81,12 +81,17 @@ def _real_extract(self, url): r'window\._preloads\s*=\s*JSON\.parse\(', webpage, 'json string', display_id, transform_source=js_to_json, contains_pattern=r'"{(?s:.+)}"'), display_id) + canonical_url = url + domain = traverse_obj(webpage_info, ('domainInfo', 'customDomain', {str})) + if domain: + canonical_url = urllib.parse.urlparse(url)._replace(netloc=domain).geturl() + post_type = webpage_info['post']['type'] formats, subtitles = [], {} if post_type == 'podcast': formats, subtitles = [{'url': webpage_info['post']['podcast_url']}], {} elif post_type == 'video': - formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], username) + formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], canonical_url) else: self.raise_no_formats(f'Page type "{post_type}" is not supported') @@ -99,4 +104,5 @@ def _real_extract(self, url): 'thumbnail': traverse_obj(webpage_info, ('post', 'cover_image')), 'uploader': traverse_obj(webpage_info, ('pub', 'name')), 'uploader_id': str_or_none(traverse_obj(webpage_info, ('post', 'publication_id'))), + 'webpage_url': canonical_url, } From 2ad3873f0dfa9285c91d2160e36c039e69d597c7 Mon Sep 17 00:00:00 2001 From: garret Date: Fri, 6 Oct 2023 23:53:11 +0100 Subject: [PATCH 207/218] [ie/radiko] Improve extraction (#8221) Authored by: garret1317 --- yt_dlp/extractor/radiko.py | 67 ++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/yt_dlp/extractor/radiko.py b/yt_dlp/extractor/radiko.py index cef68eba0..8c8fb1a8f 100644 --- a/yt_dlp/extractor/radiko.py +++ b/yt_dlp/extractor/radiko.py @@ -1,4 +1,5 @@ import base64 +import random import urllib.parse from .common import InfoExtractor @@ -13,6 +14,7 @@ class RadikoBaseIE(InfoExtractor): + _GEO_BYPASS = False _FULL_KEY = None _HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED = ( 'https://c-rpaa.smartstream.ne.jp', @@ -32,7 +34,7 @@ class RadikoBaseIE(InfoExtractor): 'https://c-radiko.smartstream.ne.jp', ) - def _auth_client(self): + def _negotiate_token(self): _, auth1_handle = self._download_webpage_handle( 'https://radiko.jp/v2/api/auth1', None, 'Downloading authentication page', headers={ @@ -58,10 +60,23 @@ def _auth_client(self): 'x-radiko-partialkey': partial_key, }).split(',')[0] + if area_id == 'OUT': + self.raise_geo_restricted(countries=['JP']) + auth_data = (auth_token, area_id) self.cache.store('radiko', 'auth_data', auth_data) return auth_data + def _auth_client(self): + cachedata = self.cache.load('radiko', 'auth_data') + if cachedata is not None: + response = self._download_webpage( + 'https://radiko.jp/v2/api/auth_check', None, 'Checking cached token', expected_status=401, + headers={'X-Radiko-AuthToken': cachedata[0], 'X-Radiko-AreaId': cachedata[1]}) + if response == 'OK': + return cachedata + return self._negotiate_token() + def _extract_full_key(self): if self._FULL_KEY: return self._FULL_KEY @@ -75,7 +90,7 @@ def _extract_full_key(self): if full_key: full_key = 
full_key.encode() - else: # use full key ever known + else: # use only full key ever known full_key = b'bcd151073c03b352e1ef2fd66c32209da9ca0afa' self._FULL_KEY = full_key @@ -103,24 +118,24 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, m3u8_playlist_data = self._download_xml( f'https://radiko.jp/v3/station/stream/pc_html5/{station}.xml', video_id, note='Downloading stream information') - m3u8_urls = m3u8_playlist_data.findall('.//url') formats = [] found = set() - for url_tag in m3u8_urls: - pcu = url_tag.find('playlist_create_url').text - url_attrib = url_tag.attrib + + timefree_int = 0 if is_onair else 1 + + for element in m3u8_playlist_data.findall(f'.//url[@timefree="{timefree_int}"]/playlist_create_url'): + pcu = element.text + if pcu in found: + continue + found.add(pcu) playlist_url = update_url_query(pcu, { 'station_id': station, **query, 'l': '15', - 'lsid': '88ecea37e968c1f17d5413312d9f8003', + 'lsid': ''.join(random.choices('0123456789abcdef', k=32)), 'type': 'b', }) - if playlist_url in found: - continue - else: - found.add(playlist_url) time_to_skip = None if is_onair else cursor - ft @@ -138,7 +153,7 @@ def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, not is_onair and pcu.startswith(self._HOSTS_FOR_TIME_FREE_FFMPEG_UNSUPPORTED)): sf['preference'] = -100 sf['format_note'] = 'not preferred' - if not is_onair and url_attrib['timefree'] == '1' and time_to_skip: + if not is_onair and timefree_int == 1 and time_to_skip: sf['downloader_options'] = {'ffmpeg_args': ['-ss', time_to_skip]} formats.extend(subformats) @@ -166,21 +181,7 @@ def _real_extract(self, url): vid_int = unified_timestamp(video_id, False) prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) - auth_cache = self.cache.load('radiko', 'auth_data') - for attempt in range(2): - auth_token, area_id = (not attempt and auth_cache) or self._auth_client() - formats = self._extract_formats( - video_id=video_id, station=station, is_onair=False, - ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, - query={ - 'start_at': radio_begin, - 'ft': radio_begin, - 'end_at': radio_end, - 'to': radio_end, - 'seek': video_id, - }) - if formats: - break + auth_token, area_id = self._auth_client() return { 'id': video_id, @@ -189,8 +190,18 @@ def _real_extract(self, url): 'uploader': try_call(lambda: station_program.find('.//name').text), 'uploader_id': station, 'timestamp': vid_int, - 'formats': formats, 'is_live': True, + 'formats': self._extract_formats( + video_id=video_id, station=station, is_onair=False, + ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, + query={ + 'start_at': radio_begin, + 'ft': radio_begin, + 'end_at': radio_end, + 'to': radio_end, + 'seek': video_id + } + ), } From 35d9cbaf9638ccc9daf8a863063b2e7c135bc664 Mon Sep 17 00:00:00 2001 From: AS6939 <46506352+AS6939@users.noreply.github.com> Date: Sat, 7 Oct 2023 06:56:12 +0800 Subject: [PATCH 208/218] [ie/iq.com] Fix extraction and subtitles (#8260) Closes #7734, Closes #8123 Authored by: AS6939 --- yt_dlp/extractor/iqiyi.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/iqiyi.py b/yt_dlp/extractor/iqiyi.py index fa602ba88..3368ab1d9 100644 --- a/yt_dlp/extractor/iqiyi.py +++ b/yt_dlp/extractor/iqiyi.py @@ -499,9 +499,10 @@ class IqIE(InfoExtractor): 'tm': tm, 'qdy': 'a', 'qds': 0, - 'k_ft1': 141287244169348, - 'k_ft4': 34359746564, - 'k_ft5': 1, + 'k_ft1': '143486267424900', + 
'k_ft4': '1572868', + 'k_ft7': '4', + 'k_ft5': '1', 'bop': JSON.stringify({ 'version': '10.0', 'dfp': dfp @@ -529,14 +530,22 @@ def _extract_vms_player_js(self, webpage, video_id): webpack_js_url = self._proto_relative_url(self._search_regex( r'') + + return { + 'id': video_id, + 'formats': self._extract_m3u8_formats( + data.get('file_url') or data['stream_url'], video_id, 'm4a', m3u8_id='hls'), + 'age_limit': 18, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('description', {str}), + 'release_timestamp': ('created_at', {parse_iso8601}), + 'modified_timestamp': ('updated_at', {parse_iso8601}), + 'uploader': ('user', 'name', {str}), + 'uploader_id': ('user', 'id', {str_or_none}), + 'uploader_url': ('user', 'permalink_url', {url_or_none}), + 'thumbnail': ('artwork_url', {url_or_none}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('plays', {int_or_none}), + 'comment_count': ('comment_count', {int_or_none}), + 'webpage_url': ('permalink_url', {url_or_none}), + }), + } From 0e722f2f3ca42e634fd7b06ee70b16bf833ce132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Droz?= Date: Fri, 6 Oct 2023 19:59:42 -0300 Subject: [PATCH 210/218] [ie/lbry] Extract `uploader_id` (#8244) Closes #123 Authored by: drzraf --- yt_dlp/extractor/lbry.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index ccce300b5..cc37c41e8 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -70,11 +70,11 @@ def _parse_stream(self, stream, url): 'duration': ('value', stream_type, 'duration', {int_or_none}), 'channel': ('signing_channel', 'value', 'title', {str}), 'channel_id': ('signing_channel', 'claim_id', {str}), + 'uploader_id': ('signing_channel', 'name', {str}), }) - channel_name = traverse_obj(stream, ('signing_channel', 'name', {str})) - if channel_name and info.get('channel_id'): - info['channel_url'] = self._permanent_url(url, channel_name, info['channel_id']) + if info.get('uploader_id') and info.get('channel_id'): + info['channel_url'] = self._permanent_url(url, info['uploader_id'], info['channel_id']) return info @@ -159,6 +159,7 @@ class LBRYIE(LBRYBaseIE): 'height': 720, 'thumbnail': 'https://spee.ch/7/67f2d809c263288c.png', 'license': 'None', + 'uploader_id': '@Mantega', 'duration': 346, 'channel': 'LBRY/Odysee rats united!!!', 'channel_id': '1c8ad6a2ab4e889a71146ae4deeb23bb92dab627', @@ -192,6 +193,7 @@ class LBRYIE(LBRYBaseIE): 'vcodec': 'none', 'thumbnail': 'https://spee.ch/d/0bc63b0e6bf1492d.png', 'license': 'None', + 'uploader_id': '@LBRYFoundation', } }, { 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', @@ -210,7 +212,8 @@ class LBRYIE(LBRYBaseIE): 'channel': 'Gardening In Canada', 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', - 'formats': 'mincount:3', # FIXME + 'uploader_id': '@gardeningincanada', + 'formats': 'mincount:3', 'thumbnail': 'https://thumbnails.lbry.com/AgHSc_HzrrE', 'license': 'Copyrighted (contact publisher)', } @@ -235,6 +238,7 @@ class LBRYIE(LBRYBaseIE): 'formats': 'mincount:1', 'thumbnail': 'startswith:https://thumb', 'license': 'None', + 'uploader_id': '@RT', }, 'params': {'skip_download': True} }, { @@ -249,6 +253,7 @@ class LBRYIE(LBRYBaseIE): 'channel': 'Wicked Truths', 'channel_id': '23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', 'channel_url': 
'https://odysee.com/@wickedtruths:23d2bbf856b0ceed5b1d7c5960bcc72da5a20cb0', + 'uploader_id': '@wickedtruths', 'timestamp': 1695114347, 'upload_date': '20230919', 'release_timestamp': 1685617473, From e831c80e8b2fc025b3b67d82974cc59e3526fdc8 Mon Sep 17 00:00:00 2001 From: garret Date: Sat, 7 Oct 2023 00:05:48 +0100 Subject: [PATCH 211/218] [ie/nhk] Fix VOD extraction (#8249) Closes #8242 Authored by: garret1317 --- yt_dlp/extractor/nhk.py | 46 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index fbd6a18f6..bcbc2279f 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -28,6 +28,44 @@ def _call_api(self, m_id, lang, is_video, is_episode, is_clip): m_id, lang, '/all' if is_video else ''), m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or [] + def _get_api_info(self, refresh=True): + if not refresh: + return self.cache.load('nhk', 'api_info') + + self.cache.store('nhk', 'api_info', {}) + movie_player_js = self._download_webpage( + 'https://movie-a.nhk.or.jp/world/player/js/movie-player.js', None, + note='Downloading stream API information') + api_info = { + 'url': self._search_regex( + r'prod:[^;]+\bapiUrl:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API url'), + 'token': self._search_regex( + r'prod:[^;]+\btoken:\s*[\'"]([^\'"]+)[\'"]', movie_player_js, None, 'stream API token'), + } + self.cache.store('nhk', 'api_info', api_info) + return api_info + + def _extract_formats_and_subtitles(self, vod_id): + for refresh in (False, True): + api_info = self._get_api_info(refresh) + if not api_info: + continue + + api_url = api_info.pop('url') + stream_url = traverse_obj( + self._download_json( + api_url, vod_id, 'Downloading stream url info', fatal=False, query={ + **api_info, + 'type': 'json', + 'optional_id': vod_id, + 'active_flg': 1, + }), + ('meta', 0, 'movie_url', ('mb_auto', 'auto_sp', 'auto_pc'), {url_or_none}), get_all=False) + if stream_url: + return self._extract_m3u8_formats_and_subtitles(stream_url, vod_id) + + raise ExtractorError('Unable to extract stream url') + def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() @@ -67,12 +105,14 @@ def get_clean_field(key): } if is_video: vod_id = episode['vod_id'] + formats, subs = self._extract_formats_and_subtitles(vod_id) + info.update({ - '_type': 'url_transparent', - 'ie_key': 'Piksel', - 'url': 'https://movie-s.nhk.or.jp/v/refid/nhkworld/prefid/' + vod_id, 'id': vod_id, + 'formats': formats, + 'subtitles': subs, }) + else: if fetch_episode: audio_path = episode['audio']['audio'] From 19c90e405b4137c06dfe6f9aaa02396df0da93e5 Mon Sep 17 00:00:00 2001 From: trainman261 Date: Sat, 7 Oct 2023 01:56:19 +0200 Subject: [PATCH 212/218] [cleanup] Update extractor tests (#7718) Authored by: trainman261 --- yt_dlp/extractor/aenetworks.py | 1 + yt_dlp/extractor/amcnetworks.py | 1 + yt_dlp/extractor/cbc.py | 7 ++++++- yt_dlp/extractor/cbs.py | 2 ++ yt_dlp/extractor/cnbc.py | 2 ++ yt_dlp/extractor/corus.py | 3 ++- yt_dlp/extractor/generic.py | 13 ++++++++++--- yt_dlp/extractor/mediaset.py | 3 ++- yt_dlp/extractor/movieclips.py | 1 + yt_dlp/extractor/nationalgeographic.py | 3 +++ yt_dlp/extractor/nbc.py | 22 +++++++++++++++++----- yt_dlp/extractor/scrippsnetworks.py | 4 ++++ yt_dlp/extractor/syfy.py | 1 + yt_dlp/extractor/theplatform.py | 6 +++--- yt_dlp/extractor/theweatherchannel.py | 20 
+++++++++++--------- 15 files changed, 66 insertions(+), 23 deletions(-) diff --git a/yt_dlp/extractor/aenetworks.py b/yt_dlp/extractor/aenetworks.py index f049a0fb3..cc26653c1 100644 --- a/yt_dlp/extractor/aenetworks.py +++ b/yt_dlp/extractor/aenetworks.py @@ -338,6 +338,7 @@ class BiographyIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': '404 Not Found', }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/amcnetworks.py b/yt_dlp/extractor/amcnetworks.py index c58bc7bfb..10bd021c5 100644 --- a/yt_dlp/extractor/amcnetworks.py +++ b/yt_dlp/extractor/amcnetworks.py @@ -26,6 +26,7 @@ class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE # m3u8 download 'skip_download': True, }, + 'skip': '404 Not Found', }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 2920b9027..be2d13e44 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -66,6 +66,7 @@ class CBCIE(InfoExtractor): 'uploader': 'CBCC-NEW', 'timestamp': 255977160, }, + 'skip': '404 Not Found', }, { # multiple iframes 'url': 'http://www.cbc.ca/natureofthings/blog/birds-eye-view-from-vancouvers-burrard-street-bridge-how-we-got-the-shot', @@ -97,7 +98,7 @@ class CBCIE(InfoExtractor): # multiple CBC.APP.Caffeine.initInstance(...) 'url': 'http://www.cbc.ca/news/canada/calgary/dog-indoor-exercise-winter-1.3928238', 'info_dict': { - 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', + 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', # FIXME 'id': 'dog-indoor-exercise-winter-1.3928238', 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', }, @@ -476,6 +477,10 @@ class CBCGemPlaylistIE(InfoExtractor): 'id': 'schitts-creek/s06', 'title': 'Season 6', 'description': 'md5:6a92104a56cbeb5818cc47884d4326a2', + 'series': 'Schitt\'s Creek', + 'season_number': 6, + 'season': 'Season 6', + 'thumbnail': 'https://images.radio-canada.ca/v1/synps-cbc/season/perso/cbc_schitts_creek_season_06_carousel_v03.jpg?impolicy=ott&im=Resize=(_Size_)&quality=75', }, }, { 'url': 'https://gem.cbc.ca/schitts-creek/s06', diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index 1c0dbdea9..d97fbd758 100644 --- a/yt_dlp/extractor/cbs.py +++ b/yt_dlp/extractor/cbs.py @@ -101,6 +101,7 @@ class CBSIE(CBSBaseIE): # m3u8 download 'skip_download': True, }, + 'skip': 'Subscription required', }, { 'url': 'https://www.cbs.com/shows/video/sZH1MGgomIosZgxGJ1l263MFq16oMtW1/', 'info_dict': { @@ -117,6 +118,7 @@ class CBSIE(CBSBaseIE): }, 'expected_warnings': [ 'This content expired on', 'No video formats found', 'Requested format is not available'], + 'skip': '404 Not Found', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', 'only_matching': True, diff --git a/yt_dlp/extractor/cnbc.py b/yt_dlp/extractor/cnbc.py index 68fd025b7..7d209b6d9 100644 --- a/yt_dlp/extractor/cnbc.py +++ b/yt_dlp/extractor/cnbc.py @@ -19,6 +19,7 @@ class CNBCIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'Dead link', } def _real_extract(self, url): @@ -49,6 +50,7 @@ class CNBCVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Dead link', } def _real_extract(self, url): diff --git a/yt_dlp/extractor/corus.py b/yt_dlp/extractor/corus.py index c03d65310..bcc34ddd8 100644 --- 
a/yt_dlp/extractor/corus.py +++ b/yt_dlp/extractor/corus.py @@ -41,7 +41,7 @@ class CorusIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE ) ''' _TESTS = [{ - 'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/', + 'url': 'https://www.hgtv.ca/video/bryan-inc/movie-night-popcorn-with-bryan/870923331648/', 'info_dict': { 'id': '870923331648', 'ext': 'mp4', @@ -54,6 +54,7 @@ class CorusIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE 'skip_download': True, }, 'expected_warnings': ['Failed to parse JSON'], + # FIXME: yt-dlp wrongly raises for geo restriction }, { 'url': 'http://www.foodnetwork.ca/shows/chopped/video/episode/chocolate-obsession/video.html?v=872683587753', 'only_matching': True, diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 33e71d1c5..5e1240c13 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -58,6 +58,8 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'trailer', 'upload_date': '20100513', + 'direct': True, + 'timestamp': 1273772943.0, } }, # Direct link to media delivered compressed (until Accept-Encoding is *) @@ -101,6 +103,8 @@ class GenericIE(InfoExtractor): 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'upload_date': '20141120', + 'direct': True, + 'timestamp': 1416498816.0, }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' @@ -133,6 +137,7 @@ class GenericIE(InfoExtractor): 'upload_date': '20201204', }, }], + 'skip': 'Dead link', }, # RSS feed with item with description and thumbnails { @@ -145,12 +150,12 @@ class GenericIE(InfoExtractor): 'playlist': [{ 'info_dict': { 'ext': 'm4a', - 'id': 'c1c879525ce2cb640b344507e682c36d', + 'id': '818a5d38-01cd-152f-2231-ee479677fa82', 'title': 're:Hydrogen!', 'description': 're:.*In this episode we are going.*', 'timestamp': 1567977776, 'upload_date': '20190908', - 'duration': 459, + 'duration': 423, 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 1, 'season_number': 1, @@ -267,6 +272,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404 Not Found', }, # MPD from http://dash-mse-test.appspot.com/media.html { @@ -278,6 +284,7 @@ class GenericIE(InfoExtractor): 'title': 'car-20120827-manifest', 'formats': 'mincount:9', 'upload_date': '20130904', + 'timestamp': 1378272859.0, }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 @@ -318,7 +325,7 @@ class GenericIE(InfoExtractor): 'id': 'cmQHVoWB5FY', 'ext': 'mp4', 'upload_date': '20130224', - 'uploader_id': 'TheVerge', + 'uploader_id': '@TheVerge', 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py index e3b728dca..2d6204298 100644 --- a/yt_dlp/extractor/mediaset.py +++ b/yt_dlp/extractor/mediaset.py @@ -127,7 +127,8 @@ class MediasetIE(ThePlatformBaseIE): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Dead link', }, { # WittyTV embed 'url': 'https://www.wittytv.it/mauriziocostanzoshow/ultima-puntata-venerdi-25-novembre/', diff --git a/yt_dlp/extractor/movieclips.py b/yt_dlp/extractor/movieclips.py index 4777f440e..f7f2921fd 100644 --- a/yt_dlp/extractor/movieclips.py +++ b/yt_dlp/extractor/movieclips.py @@ -23,6 +23,7 @@ class MovieClipsIE(InfoExtractor): 'uploader': 'Movieclips', }, 'add_ie': ['ThePlatform'], + 'skip': 'redirects to YouTube', } def _real_extract(self, url): diff 
--git a/yt_dlp/extractor/nationalgeographic.py b/yt_dlp/extractor/nationalgeographic.py index ad525c258..6f046bc29 100644 --- a/yt_dlp/extractor/nationalgeographic.py +++ b/yt_dlp/extractor/nationalgeographic.py @@ -24,6 +24,7 @@ class NationalGeographicVideoIE(InfoExtractor): 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }, { 'url': 'http://video.nationalgeographic.com/wild/when-sharks-attack/the-real-jaws', @@ -38,6 +39,7 @@ class NationalGeographicVideoIE(InfoExtractor): 'uploader': 'NAGS', }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }, ] @@ -75,6 +77,7 @@ class NationalGeographicTVIE(FOXIE): # XXX: Do not subclass from concrete IE 'params': { 'skip_download': True, }, + 'skip': 'Content not available', }] _HOME_PAGE_URL = 'https://www.nationalgeographic.com/tv/' _API_KEY = '238bb0a0c2aba67922c48709ce0c06fd' diff --git a/yt_dlp/extractor/nbc.py b/yt_dlp/extractor/nbc.py index b3c28ab55..666550a49 100644 --- a/yt_dlp/extractor/nbc.py +++ b/yt_dlp/extractor/nbc.py @@ -284,7 +284,7 @@ class NBCSportsIE(InfoExtractor): _TESTS = [{ # iframe src - 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', + 'url': 'https://www.nbcsports.com/watch/nfl/profootballtalk/pft-pm/unpacking-addisons-reckless-driving-citation', 'info_dict': { 'id': 'PHJSaFWbrTY9', 'ext': 'mp4', @@ -379,7 +379,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _TESTS = [ { 'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880', - 'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf', + 'md5': 'fb3dcd2d7b1dd9804305fa2fc95ab610', # md5 tends to fluctuate 'info_dict': { 'id': '269389891880', 'ext': 'mp4', @@ -387,6 +387,8 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', 'timestamp': 1401363060, 'upload_date': '20140529', + 'duration': 46.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/140529/p_tweet_snow_140529.jpg', }, }, { @@ -402,7 +404,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE }, { 'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', - 'md5': '8eb831eca25bfa7d25ddd83e85946548', + 'md5': '40d0e48c68896359c80372306ece0fc3', 'info_dict': { 'id': '394064451844', 'ext': 'mp4', @@ -410,11 +412,13 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', 'timestamp': 1423104900, 'upload_date': '20150205', + 'duration': 1236.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/nn_netcast_150204.jpg', }, }, { 'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456', - 'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0', + 'md5': 'ffb59bcf0733dc3c7f0ace907f5e3939', 'info_dict': { 'id': 'n431456', 'ext': 'mp4', @@ -422,11 +426,13 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301', 'upload_date': '20150922', 'timestamp': 1442917800, + 'duration': 37.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/__NEW/x_lon_vwhorn_150922.jpg', }, }, { 'url': 
'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788', - 'md5': '118d7ca3f0bea6534f119c68ef539f71', + 'md5': '693d1fa21d23afcc9b04c66b227ed9ff', 'info_dict': { 'id': '669831235788', 'ext': 'mp4', @@ -434,6 +440,8 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1', 'upload_date': '20160420', 'timestamp': 1461152093, + 'duration': 69.0, + 'thumbnail': 'https://media-cldnry.s-nbcnews.com/image/upload/MSNBC/Components/Video/201604/2016-04-20T11-35-09-133Z--1280x720.jpg', }, }, { @@ -447,6 +455,7 @@ class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1406937606, 'upload_date': '20140802', + 'duration': 940.0, }, }, { @@ -535,6 +544,7 @@ class NBCOlympicsIE(InfoExtractor): 'upload_date': '20160815', 'uploader': 'NBCU-SPORTS', }, + 'skip': '404 Not Found', } def _real_extract(self, url): @@ -578,6 +588,7 @@ class NBCOlympicsStreamIE(AdobePassIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'Livestream', }, { 'note': 'Plain m3u8 source URL', 'url': 'https://stream.nbcolympics.com/gymnastics-event-finals-mens-floor-pommel-horse-womens-vault-bars', @@ -589,6 +600,7 @@ class NBCOlympicsStreamIE(AdobePassIE): 'params': { 'skip_download': 'm3u8', }, + 'skip': 'Livestream', }, ] diff --git a/yt_dlp/extractor/scrippsnetworks.py b/yt_dlp/extractor/scrippsnetworks.py index adfd7e5f2..7f0bc9645 100644 --- a/yt_dlp/extractor/scrippsnetworks.py +++ b/yt_dlp/extractor/scrippsnetworks.py @@ -39,6 +39,7 @@ class ScrippsNetworksWatchIE(AWSIE): 'skip_download': True, }, 'add_ie': [AnvatoIE.ie_key()], + 'skip': '404 Not Found', }] _SNI_TABLE = { @@ -113,6 +114,9 @@ class ScrippsNetworksIE(InfoExtractor): 'timestamp': 1475678834, 'upload_date': '20161005', 'uploader': 'SCNI-SCND', + 'duration': 29.995, + 'chapters': [{'start_time': 0.0, 'end_time': 29.995, 'title': ''}], + 'thumbnail': 'https://images.dds.discovery.com/up/tp/Scripps_-_Food_Category_Prod/122/987/0260338_630x355.jpg', }, 'add_ie': ['ThePlatform'], 'expected_warnings': ['No HLS formats found'], diff --git a/yt_dlp/extractor/syfy.py b/yt_dlp/extractor/syfy.py index c79d27a0d..afcdbf780 100644 --- a/yt_dlp/extractor/syfy.py +++ b/yt_dlp/extractor/syfy.py @@ -23,6 +23,7 @@ class SyfyIE(AdobePassIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'Redirects to main page', }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/theplatform.py b/yt_dlp/extractor/theplatform.py index 99caeb5f9..433ce8427 100644 --- a/yt_dlp/extractor/theplatform.py +++ b/yt_dlp/extractor/theplatform.py @@ -167,7 +167,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): # rtmp download 'skip_download': True, }, - 'skip': '404 Not Found', + 'skip': 'CNet no longer uses ThePlatform', }, { 'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD', 'info_dict': { @@ -177,7 +177,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'title': 'HIGHLIGHTS: USA bag first ever series Cup win', 'uploader': 'EGSM', }, - 'skip': '404 Not Found', + 'skip': 'Dead link', }, { 'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7', 'only_matching': True, @@ -195,7 +195,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): 'upload_date': '20150701', 'uploader': 'NBCU-NEWS', }, - 'skip': '404 Not Found', + 'skip': 'Error: Player PID "nbcNewsOffsite" is disabled', }, { # From 
http://www.nbc.com/the-blacklist/video/sir-crispin-crandall/2928790?onid=137781#vc137781=1 # geo-restricted (US), HLS encrypted with AES-128 diff --git a/yt_dlp/extractor/theweatherchannel.py b/yt_dlp/extractor/theweatherchannel.py index 682e4335d..d1921e4f9 100644 --- a/yt_dlp/extractor/theweatherchannel.py +++ b/yt_dlp/extractor/theweatherchannel.py @@ -11,17 +11,19 @@ class TheWeatherChannelIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?weather\.com(?P<asset_name>(?:/(?P<locale>[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P<id>[^/?#]+))' _TESTS = [{ - 'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', - 'md5': 'c4cbe74c9c17c5676b704b950b73dd92', + 'url': 'https://weather.com/storms/hurricane/video/invest-95l-in-atlantic-has-a-medium-chance-of-development', + 'md5': '68f0cf616435683f27ce36bd9c927394', 'info_dict': { - 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', + 'id': '81acef2d-ee8c-4545-ba83-bff3cc80db97', 'ext': 'mp4', - 'title': 'Ice Climber Is In For A Shock', - 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', - 'uploader': 'TWC - Digital (No Distro)', - 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', - 'upload_date': '20160720', - 'timestamp': 1469018835, + 'title': 'Invest 95L In Atlantic Has A Medium Chance Of Development', + 'description': 'md5:0de720fd5f0d0e32207bd4c270fff824', + 'uploader': 'TWC - Digital', + 'uploader_id': 'b5a999e0-9e04-11e1-9ee2-001d092f5a10', + 'upload_date': '20230721', + 'timestamp': 1689967343, + 'display_id': 'invest-95l-in-atlantic-has-a-medium-chance-of-development', + 'duration': 34.0, } }, { 'url': 'https://weather.com/en-CA/international/videos/video/unidentified-object-falls-from-sky-in-india', From 792f1e64f6a2beac51e85408d142b3118115c4fd Mon Sep 17 00:00:00 2001 From: Aleri Kaisattera <73682764+alerikaisattera@users.noreply.github.com> Date: Sat, 7 Oct 2023 05:56:47 +0600 Subject: [PATCH 213/218] [ie/theta] Remove extractors (#8251) Authored by: alerikaisattera --- yt_dlp/extractor/_extractors.py | 4 -- yt_dlp/extractor/theta.py | 90 --------------------------------- 2 files changed, 94 deletions(-) delete mode 100644 yt_dlp/extractor/theta.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index b10ef2f33..55c3c2f8e 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -2004,10 +2004,6 @@ ) from .thestar import TheStarIE from .thesun import TheSunIE -from .theta import ( - ThetaVideoIE, - ThetaStreamIE, -) from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE diff --git a/yt_dlp/extractor/theta.py b/yt_dlp/extractor/theta.py deleted file mode 100644 index ecf0ea091..000000000 --- a/yt_dlp/extractor/theta.py +++ /dev/null @@ -1,90 +0,0 @@ -from .common import InfoExtractor -from ..utils import try_get - - -class ThetaStreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?theta\.tv/(?!video/)(?P<id>[a-z0-9-]+)' - _TESTS = [{ - 'url': 'https://www.theta.tv/davirus', - 'skip': 'The live may have ended', - 'info_dict': { - 'id': 'DaVirus', - 'ext': 'mp4', - 'title': 'I choose you - My Community is King -👀 - YO HABLO ESPANOL - CODE DAVIRUS', - 'thumbnail': r're:https://live-thumbnails-prod-theta-tv\.imgix\.net/thumbnail/.+\.jpg', - } - }, { - 'url': 'https://www.theta.tv/mst3k', - 'note': 'This channel is live 24/7', - 'info_dict': { - 'id': 'MST3K', - 'ext': 'mp4', - 'title': 'Mystery Science Theatre 3000 24/7 Powered by the THETA Network.', -
'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg', - } - }, { - 'url': 'https://www.theta.tv/contv-anime', - 'info_dict': { - 'id': 'ConTVAnime', - 'ext': 'mp4', - 'title': 'CONTV ANIME 24/7. Powered by THETA Network.', - 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+\.jpg', - } - }] - - def _real_extract(self, url): - channel_id = self._match_id(url) - info = self._download_json(f'https://api.theta.tv/v1/channel?alias={channel_id}', channel_id)['body'] - - m3u8_playlist = next( - data['url'] for data in info['live_stream']['video_urls'] - if data.get('type') != 'embed' and data.get('resolution') in ('master', 'source')) - - formats = self._extract_m3u8_formats(m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True) - - channel = try_get(info, lambda x: x['user']['username']) # using this field instead of channel_id due to capitalization - - return { - 'id': channel, - 'title': try_get(info, lambda x: x['live_stream']['title']), - 'channel': channel, - 'view_count': try_get(info, lambda x: x['live_stream']['view_count']), - 'is_live': True, - 'formats': formats, - 'thumbnail': try_get(info, lambda x: x['live_stream']['thumbnail_url']), - } - - -class ThetaVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?theta\.tv/video/(?Pvid[a-z0-9]+)' - _TEST = { - 'url': 'https://www.theta.tv/video/vidiq6aaet3kzf799p0', - 'md5': '633d8c29eb276bb38a111dbd591c677f', - 'info_dict': { - 'id': 'vidiq6aaet3kzf799p0', - 'ext': 'mp4', - 'title': 'Theta EdgeCast Tutorial', - 'uploader': 'Pixiekittie', - 'description': 'md5:e316253f5bdced8b5a46bb50ae60a09f', - 'thumbnail': r're:https://user-prod-theta-tv\.imgix\.net/.+/vod_thumb/.+.jpg', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json(f'https://api.theta.tv/v1/video/{video_id}/raw', video_id)['body'] - - m3u8_playlist = try_get(info, lambda x: x['video_urls'][0]['url']) - - formats = self._extract_m3u8_formats(m3u8_playlist, video_id, 'mp4', m3u8_id='hls') - - return { - 'id': video_id, - 'title': info.get('title'), - 'uploader': try_get(info, lambda x: x['user']['username']), - 'description': info.get('description'), - 'view_count': info.get('view_count'), - 'like_count': info.get('like_count'), - 'formats': formats, - 'thumbnail': info.get('thumbnail_url'), - } From 03e85ea99db76a2fddb65bf46f8819bda780aaf3 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Fri, 6 Oct 2023 20:00:15 -0500 Subject: [PATCH 214/218] [ie/youtube] Fix `heatmap` extraction (#8299) Closes #8189 Authored by: bashonly --- yt_dlp/extractor/youtube.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 7e13aa779..b7ac3e9cc 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3292,16 +3292,15 @@ def _extract_chapters_from_engagement_panel(self, data, duration): chapter_time, chapter_title, duration) for contents in content_list)), []) - def _extract_heatmap_from_player_overlay(self, data): - content_list = traverse_obj(data, ( - 'playerOverlays', 'playerOverlayRenderer', 'decoratedPlayerBarRenderer', 'decoratedPlayerBarRenderer', 'playerBar', - 'multiMarkersPlayerBarRenderer', 'markersMap', ..., 'value', 'heatmap', 'heatmapRenderer', 'heatMarkers', {list})) - return next(filter(None, ( - traverse_obj(contents, (..., 'heatMarkerRenderer', { - 'start_time': ('timeRangeStartMillis', {functools.partial(float_or_none, scale=1000)}), 
-                'end_time': {lambda x: (x['timeRangeStartMillis'] + x['markerDurationMillis']) / 1000},
-                'value': ('heatMarkerIntensityScoreNormalized', {float_or_none}),
-            })) for contents in content_list)), None)
+    def _extract_heatmap(self, data):
+        return traverse_obj(data, (
+            'frameworkUpdates', 'entityBatchUpdate', 'mutations',
+            lambda _, v: v['payload']['macroMarkersListEntity']['markersList']['markerType'] == 'MARKER_TYPE_HEATMAP',
+            'payload', 'macroMarkersListEntity', 'markersList', 'markers', ..., {
+                'start_time': ('startMillis', {functools.partial(float_or_none, scale=1000)}),
+                'end_time': {lambda x: (int(x['startMillis']) + int(x['durationMillis'])) / 1000},
+                'value': ('intensityScoreNormalized', {float_or_none}),
+            })) or None
 
     def _extract_comment(self, comment_renderer, parent=None):
         comment_id = comment_renderer.get('commentId')
@@ -4435,7 +4434,7 @@ def process_language(container, base_url, lang_code, sub_name, query):
                 or self._extract_chapters_from_description(video_description, duration)
                 or None)
 
-        info['heatmap'] = self._extract_heatmap_from_player_overlay(initial_data)
+        info['heatmap'] = self._extract_heatmap(initial_data)
 
         contents = traverse_obj(
             initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents'),

From 377e85a1797db9e98b78b38203ed9d4ded229991 Mon Sep 17 00:00:00 2001
From: Simon Sawicki
Date: Sat, 7 Oct 2023 03:02:45 +0200
Subject: [PATCH 215/218] [cleanup] Misc (#8300)

* Simplify nuxt regex
* Fix tmz quotes and tests
* Update test python versions

Authored by: dirkf, gamer191, Grub4K
---
 .github/workflows/core.yml     |   4 +-
 .github/workflows/download.yml |   2 +-
 yt_dlp/extractor/common.py     |   2 +-
 yt_dlp/extractor/tmz.py        | 266 +++++++++++++++++----------------
 4 files changed, 138 insertions(+), 136 deletions(-)

diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml
index 689408c50..7fcf11dfa 100644
--- a/.github/workflows/core.yml
+++ b/.github/workflows/core.yml
@@ -13,7 +13,7 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         # CPython 3.11 is in quick-test
-        python-version: ['3.8', '3.9', '3.10', '3.12-dev', pypy-3.7, pypy-3.8, pypy-3.10]
+        python-version: ['3.8', '3.9', '3.10', '3.12', pypy-3.7, pypy-3.8, pypy-3.10]
         run-tests-ext: [sh]
         include:
         # atleast one of each CPython/PyPy tests must be in windows
@@ -21,7 +21,7 @@ jobs:
           python-version: '3.7'
           run-tests-ext: bat
         - os: windows-latest
-          python-version: '3.12-dev'
+          python-version: '3.12'
           run-tests-ext: bat
         - os: windows-latest
           python-version: pypy-3.9
diff --git a/.github/workflows/download.yml b/.github/workflows/download.yml
index 2b2387d4f..c3478721c 100644
--- a/.github/workflows/download.yml
+++ b/.github/workflows/download.yml
@@ -28,7 +28,7 @@ jobs:
       fail-fast: true
       matrix:
         os: [ubuntu-latest]
-        python-version: ['3.7', '3.10', 3.11-dev, pypy-3.7, pypy-3.8]
+        python-version: ['3.7', '3.10', '3.12', pypy-3.7, pypy-3.8, pypy-3.10]
         run-tests-ext: [sh]
         include:
         # atleast one of each CPython/PyPy tests must be in windows
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index c94b4abdc..c3ceb0039 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -1687,7 +1687,7 @@ def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal
     def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)):
         """Parses Nuxt.js metadata.
This works as long as the function `__NUXT__` invokes is a pure function"""
         rectx = re.escape(context_name)
-        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){(?:.*?)return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
+        FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){.*?\breturn\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)'
         js, arg_keys, arg_vals = self._search_regex(
             (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'),
             webpage, context_name, group=('js', 'arg_keys', 'arg_vals'),
diff --git a/yt_dlp/extractor/tmz.py b/yt_dlp/extractor/tmz.py
index ffb30c6b8..edd16bc5b 100644
--- a/yt_dlp/extractor/tmz.py
+++ b/yt_dlp/extractor/tmz.py
@@ -8,158 +8,160 @@
 
 
 class TMZIE(InfoExtractor):
-    _VALID_URL = r"https?://(?:www\.)?tmz\.com/.*"
+    _VALID_URL = r'https?://(?:www\.)?tmz\.com/.*'
     _TESTS = [
         {
-            "url": "http://www.tmz.com/videos/0-cegprt2p/",
-            "info_dict": {
-                "id": "http://www.tmz.com/videos/0-cegprt2p/",
-                "ext": "mp4",
-                "title": "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet",
-                "description": "Harvey talks about Director Comey’s decision not to prosecute Hillary Clinton.",
-                "timestamp": 1467831837,
-                "uploader": "TMZ Staff",
-                "upload_date": "20160706",
-                "thumbnail": "https://imagez.tmz.com/image/5e/4by3/2016/07/06/5eea7dc01baa5c2e83eb06930c170e46_xl.jpg",
-                "duration": 772.0,
+            'url': 'http://www.tmz.com/videos/0-cegprt2p/',
+            'info_dict': {
+                'id': 'http://www.tmz.com/videos/0-cegprt2p/',
+                'ext': 'mp4',
+                'title': 'No Charges Against Hillary Clinton? Harvey Says It Ain\'t Over Yet',
+                'description': 'Harvey talks about Director Comey’s decision not to prosecute Hillary Clinton.',
+                'timestamp': 1467831837,
+                'uploader': 'TMZ Staff',
+                'upload_date': '20160706',
+                'thumbnail': 'https://imagez.tmz.com/image/5e/4by3/2016/07/06/5eea7dc01baa5c2e83eb06930c170e46_xl.jpg',
+                'duration': 772.0,
             },
         },
         {
-            "url": "https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/",
-            "info_dict": {
-                "id": "https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/",
-                "ext": "mp4",
-                "title": "Angry Bagel Shop Guy Says He Doesn't Trust Women",
-                "description": "The enraged man who went viral for ranting about women on dating sites before getting ragdolled in a bagel shop is defending his misogyny ... he says it's women's fault in the first place.",
-                "timestamp": 1562889485,
-                "uploader": "TMZ Staff",
-                "upload_date": "20190711",
-                "thumbnail": "https://imagez.tmz.com/image/a8/4by3/2019/07/12/a85480d27b2f50a7bfea2322151d67a5_xl.jpg",
-                "duration": 123.0,
+            'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/',
+            'info_dict': {
+                'id': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/',
+                'ext': 'mp4',
+                'title': 'Angry Bagel Shop Guy Says He Doesn\'t Trust Women',
+                'description': 'The enraged man who went viral for ranting about women on dating sites before getting ragdolled in a bagel shop is defending his misogyny ... he says it\'s women\'s fault in the first place.',
+                'timestamp': 1562889485,
+                'uploader': 'TMZ Staff',
+                'upload_date': '20190711',
+                'thumbnail': 'https://imagez.tmz.com/image/a8/4by3/2019/07/12/a85480d27b2f50a7bfea2322151d67a5_xl.jpg',
+                'duration': 123.0,
             },
         },
         {
-            "url": "http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert",
-            "md5": "5429c85db8bde39a473a56ca8c4c5602",
-            "info_dict": {
-                "id": "http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert",
-                "ext": "mp4",
-                "title": "Bobby Brown Tells Crowd ... Bobbi Kristina is Awake",
Bobbi Kristina is Awake", - "description": 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', - "timestamp": 1429467813, - "uploader": "TMZ Staff", - "upload_date": "20150419", - "duration": 29.0, - "thumbnail": "https://imagez.tmz.com/image/15/4by3/2015/04/20/1539c7ae136359fc979236fa6a9449dd_xl.jpg", + 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', + 'md5': '5429c85db8bde39a473a56ca8c4c5602', + 'info_dict': { + 'id': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', + 'ext': 'mp4', + 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', + 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', + 'timestamp': 1429467813, + 'uploader': 'TMZ Staff', + 'upload_date': '20150419', + 'duration': 29.0, + 'thumbnail': 'https://imagez.tmz.com/image/15/4by3/2015/04/20/1539c7ae136359fc979236fa6a9449dd_xl.jpg', }, }, { - "url": "http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/", - "info_dict": { - "id": "http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/", - "ext": "mp4", - "title": "Patti LaBelle -- Goes Nuclear On Stripping Fan", - "description": "Patti LaBelle made it known loud and clear last night ... NO " - "ONE gets on her stage and strips down.", - "timestamp": 1442683746, - "uploader": "TMZ Staff", - "upload_date": "20150919", - "duration": 104.0, - "thumbnail": "https://imagez.tmz.com/image/5e/4by3/2015/09/20/5e57d7575062528082994e18ac3f0f48_xl.jpg", + 'url': 'http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/', + 'info_dict': { + 'id': 'http://www.tmz.com/2015/09/19/patti-labelle-concert-fan-stripping-kicked-out-nicki-minaj/', + 'ext': 'mp4', + 'title': 'Patti LaBelle -- Goes Nuclear On Stripping Fan', + 'description': 'Patti LaBelle made it known loud and clear last night ... NO ' + 'ONE gets on her stage and strips down.', + 'timestamp': 1442683746, + 'uploader': 'TMZ Staff', + 'upload_date': '20150919', + 'duration': 104.0, + 'thumbnail': 'https://imagez.tmz.com/image/5e/4by3/2015/09/20/5e57d7575062528082994e18ac3f0f48_xl.jpg', }, }, { - "url": "http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/", - "info_dict": { - "id": "http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/", - "ext": "mp4", - "title": "NBA's Adam Silver -- Blake Griffin's a Great Guy ... He'll Learn from This", - "description": "Two pretty parts of this video with NBA Commish Adam Silver.", - "timestamp": 1454010989, - "uploader": "TMZ Staff", - "upload_date": "20160128", - "duration": 59.0, - "thumbnail": "https://imagez.tmz.com/image/38/4by3/2016/01/29/3856e83e0beb57059ec412122b842fb1_xl.jpg", + 'url': 'http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/', + 'info_dict': { + 'id': 'http://www.tmz.com/2016/01/28/adam-silver-sting-drake-blake-griffin/', + 'ext': 'mp4', + 'title': 'NBA\'s Adam Silver -- Blake Griffin\'s a Great Guy ... 
He\'ll Learn from This', + 'description': 'Two pretty parts of this video with NBA Commish Adam Silver.', + 'timestamp': 1454010989, + 'uploader': 'TMZ Staff', + 'upload_date': '20160128', + 'duration': 59.0, + 'thumbnail': 'https://imagez.tmz.com/image/38/4by3/2016/01/29/3856e83e0beb57059ec412122b842fb1_xl.jpg', }, }, { - "url": "http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/", - "info_dict": { - "id": "http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/", - "ext": "mp4", - "title": "Trump Star Vandal -- I'm Not Afraid of Donald or the Cops!", - "description": "James Otis is the the guy who took a pickaxe to Donald Trump's star on the Walk of Fame, and he tells TMZ .. he's ready and willing to go to jail for the crime.", - "timestamp": 1477500095, - "uploader": "TMZ Staff", - "upload_date": "20161026", - "thumbnail": "https://imagez.tmz.com/image/0d/4by3/2016/10/27/0d904814d4a75dcf9cc3b8cfd1edc1a3_xl.jpg", - "duration": 128.0, + 'url': 'http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/', + 'info_dict': { + 'id': 'http://www.tmz.com/2016/10/27/donald-trump-star-vandal-arrested-james-otis/', + 'ext': 'mp4', + 'title': 'Trump Star Vandal -- I\'m Not Afraid of Donald or the Cops!', + 'description': 'James Otis is the the guy who took a pickaxe to Donald Trump\'s star on the Walk of Fame, and he tells TMZ .. he\'s ready and willing to go to jail for the crime.', + 'timestamp': 1477500095, + 'uploader': 'TMZ Staff', + 'upload_date': '20161026', + 'thumbnail': 'https://imagez.tmz.com/image/0d/4by3/2016/10/27/0d904814d4a75dcf9cc3b8cfd1edc1a3_xl.jpg', + 'duration': 128.0, }, }, { - "url": "https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/", - "info_dict": { - "id": "https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/", - "ext": "mp4", - "title": "Cops Use Billy Clubs Against Pro-Trump and Anti-Fascist " - "Demonstrators", - "description": "Beverly Hills may be an omen of what's coming next week, " - "because things got crazy on the streets and cops started " - "swinging their billy clubs at both Anti-Fascist and Pro-Trump " - "demonstrators.", - "timestamp": 1604182772, - "uploader": "TMZ Staff", - "upload_date": "20201031", - "duration": 96.0, - "thumbnail": "https://imagez.tmz.com/image/f3/4by3/2020/10/31/f37bd5a8aef84497866f425130c58be3_xl.jpg", + 'url': 'https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/', + 'info_dict': { + 'id': 'https://www.tmz.com/videos/2020-10-31-103120-beverly-hills-protest-4878209/', + 'ext': 'mp4', + 'title': 'Cops Use Billy Clubs Against Pro-Trump and Anti-Fascist ' + 'Demonstrators', + 'description': 'Beverly Hills may be an omen of what\'s coming next week, ' + 'because things got crazy on the streets and cops started ' + 'swinging their billy clubs at both Anti-Fascist and Pro-Trump ' + 'demonstrators.', + 'timestamp': 1604182772, + 'uploader': 'TMZ Staff', + 'upload_date': '20201031', + 'duration': 96.0, + 'thumbnail': 'https://imagez.tmz.com/image/f3/4by3/2020/10/31/f37bd5a8aef84497866f425130c58be3_xl.jpg', }, }, { - "url": "https://www.tmz.com/2020/11/05/gervonta-davis-car-crash-hit-and-run-police/", - "info_dict": { - "id": "Dddb6IGe-ws", - "ext": "mp4", - "title": "SICK LAMBO GERVONTA DAVIS IN HIS NEW RIDE RIGHT AFTER KO AFTER LEO EsNews Boxing", - "uploader": "ESNEWS", - "description": "md5:49675bc58883ccf80474b8aa701e1064", - "upload_date": "20201102", - "uploader_id": "ESNEWS", - "uploader_url": 
"http://www.youtube.com/user/ESNEWS", - "like_count": int, - "channel_id": "UCI-Oq7oFGakzSzHFlTtsUsQ", - "channel": "ESNEWS", - "view_count": int, - "duration": 225, - "live_status": "not_live", - "thumbnail": "https://i.ytimg.com/vi_webp/Dddb6IGe-ws/maxresdefault.webp", - "channel_url": "https://www.youtube.com/channel/UCI-Oq7oFGakzSzHFlTtsUsQ", - "channel_follower_count": int, - "playable_in_embed": True, - "categories": ["Sports"], - "age_limit": 0, - "tags": "count:10", - "availability": "public", + 'url': 'https://www.tmz.com/2020/11/05/gervonta-davis-car-crash-hit-and-run-police/', + 'info_dict': { + 'id': 'Dddb6IGe-ws', + 'ext': 'mp4', + 'title': 'SICK LAMBO GERVONTA DAVIS IN HIS NEW RIDE RIGHT AFTER KO AFTER LEO EsNews Boxing', + 'uploader': 'ESNEWS', + 'description': 'md5:49675bc58883ccf80474b8aa701e1064', + 'upload_date': '20201102', + 'uploader_id': '@ESNEWS', + 'uploader_url': 'https://www.youtube.com/@ESNEWS', + 'like_count': int, + 'channel_id': 'UCI-Oq7oFGakzSzHFlTtsUsQ', + 'channel': 'ESNEWS', + 'view_count': int, + 'duration': 225, + 'live_status': 'not_live', + 'thumbnail': 'https://i.ytimg.com/vi_webp/Dddb6IGe-ws/maxresdefault.webp', + 'channel_url': 'https://www.youtube.com/channel/UCI-Oq7oFGakzSzHFlTtsUsQ', + 'channel_follower_count': int, + 'playable_in_embed': True, + 'categories': ['Sports'], + 'age_limit': 0, + 'tags': 'count:10', + 'availability': 'public', + 'comment_count': int, }, }, { - "url": "https://www.tmz.com/2020/11/19/conor-mcgregor-dustin-poirier-contract-fight-ufc-257-fight-island/", - "info_dict": { - "id": "1329450007125225473", - "ext": "mp4", - "title": "The Mac Life - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.", - "uploader": "The Mac Life", - "description": "md5:56e6009bbc3d12498e10d08a8e1f1c69", - "upload_date": "20201119", - "uploader_id": "TheMacLife", - "timestamp": 1605800556, - "thumbnail": "https://pbs.twimg.com/media/EnMmfT8XYAExgxJ.jpg?name=small", - "like_count": int, - "duration": 11.812, - "uploader_url": "https://twitter.com/TheMacLife", - "age_limit": 0, - "repost_count": int, - "tags": [], - "comment_count": int, + 'url': 'https://www.tmz.com/2020/11/19/conor-mcgregor-dustin-poirier-contract-fight-ufc-257-fight-island/', + 'info_dict': { + 'id': '1329448013937471491', + 'ext': 'mp4', + 'title': 'The Mac Life - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.', + 'uploader': 'The Mac Life', + 'description': 'md5:56e6009bbc3d12498e10d08a8e1f1c69', + 'upload_date': '20201119', + 'display_id': '1329450007125225473', + 'uploader_id': 'TheMacLife', + 'timestamp': 1605800556, + 'thumbnail': 'https://pbs.twimg.com/media/EnMmfT8XYAExgxJ.jpg?name=small', + 'like_count': int, + 'duration': 11.812, + 'uploader_url': 'https://twitter.com/TheMacLife', + 'age_limit': 0, + 'repost_count': int, + 'tags': [], + 'comment_count': int, }, }, ] @@ -167,25 +169,25 @@ class TMZIE(InfoExtractor): def _real_extract(self, url): webpage = self._download_webpage(url, url) jsonld = self._search_json_ld(webpage, url) - if not jsonld or "url" not in jsonld: + if not jsonld or 'url' not in jsonld: # try to extract from YouTube Player API # see https://developers.google.com/youtube/iframe_api_reference#Video_Queueing_Functions match_obj = re.search(r'\.cueVideoById\(\s*(?P[\'"])(?P.*?)(?P=quote)', webpage) if match_obj: - res = self.url_result(match_obj.group("id")) + res = 
self.url_result(match_obj.group('id')) return res # try to extract from twitter - blockquote_el = get_element_by_attribute("class", "twitter-tweet", webpage) + blockquote_el = get_element_by_attribute('class', 'twitter-tweet', webpage) if blockquote_el: matches = re.findall( r']+href=\s*(?P[\'"])(?P.*?)(?P=quote)', blockquote_el) if matches: for _, match in matches: - if "/status/" in match: + if '/status/' in match: res = self.url_result(match) return res - raise ExtractorError("No video found!") + raise ExtractorError('No video found!') if id not in jsonld: - jsonld["id"] = url + jsonld['id'] = url return jsonld From 4392c4680c383b221b6aa26d25c6e4b5581a5ad6 Mon Sep 17 00:00:00 2001 From: github-actions Date: Sat, 7 Oct 2023 01:28:34 +0000 Subject: [PATCH 216/218] Release 2023.10.07 Created by: Grub4K :ci skip all :ci run dl --- .github/ISSUE_TEMPLATE/1_broken_site.yml | 8 ++--- .../ISSUE_TEMPLATE/2_site_support_request.yml | 8 ++--- .../ISSUE_TEMPLATE/3_site_feature_request.yml | 8 ++--- .github/ISSUE_TEMPLATE/4_bug_report.yml | 8 ++--- .github/ISSUE_TEMPLATE/5_feature_request.yml | 8 ++--- .github/ISSUE_TEMPLATE/6_question.yml | 8 ++--- CONTRIBUTORS | 6 ++++ Changelog.md | 29 +++++++++++++++++++ supportedsites.md | 4 +-- yt_dlp/version.py | 4 +-- 10 files changed, 63 insertions(+), 28 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.yml b/.github/ISSUE_TEMPLATE/1_broken_site.yml index f0fc71d57..dacb41758 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.yml +++ b/.github/ISSUE_TEMPLATE/1_broken_site.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting that yt-dlp is broken on a **supported** site required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -64,7 +64,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -72,8 +72,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.yml b/.github/ISSUE_TEMPLATE/2_site_support_request.yml index ac9a72a1c..ec6e298a1 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.yml +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a new site support request required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update 
instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -76,7 +76,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -84,8 +84,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml index 577e4d491..cf3cdd21f 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.yml @@ -18,7 +18,7 @@ body: options: - label: I'm requesting a site-specific feature required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -72,7 +72,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -80,8 +80,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.yml b/.github/ISSUE_TEMPLATE/4_bug_report.yml index 9529c1bd6..1bbcf6895 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.yml +++ b/.github/ISSUE_TEMPLATE/4_bug_report.yml @@ -18,7 +18,7 @@ body: options: - label: I'm reporting a bug unrelated to a specific site required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update 
instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've checked that all provided URLs are playable in a browser with the same IP and same login details required: true @@ -57,7 +57,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -65,8 +65,8 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell validations: diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.yml b/.github/ISSUE_TEMPLATE/5_feature_request.yml index b17a6e046..d3bc06e80 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.yml +++ b/.github/ISSUE_TEMPLATE/5_feature_request.yml @@ -20,7 +20,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar issues **including closed ones**. 
DO NOT post duplicates required: true @@ -53,7 +53,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -61,7 +61,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell diff --git a/.github/ISSUE_TEMPLATE/6_question.yml b/.github/ISSUE_TEMPLATE/6_question.yml index 5345e8917..30311d5b5 100644 --- a/.github/ISSUE_TEMPLATE/6_question.yml +++ b/.github/ISSUE_TEMPLATE/6_question.yml @@ -26,7 +26,7 @@ body: required: true - label: I've looked through the [README](https://github.com/yt-dlp/yt-dlp#readme) required: true - - label: I've verified that I'm running yt-dlp version **2023.09.24** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) + - label: I've verified that I'm running yt-dlp version **2023.10.07** ([update instructions](https://github.com/yt-dlp/yt-dlp#update)) or later (specify commit) required: true - label: I've searched [known issues](https://github.com/yt-dlp/yt-dlp/issues/3766) and the [bugtracker](https://github.com/yt-dlp/yt-dlp/issues?q=) for similar questions **including closed ones**. 
DO NOT post duplicates required: true @@ -59,7 +59,7 @@ body: [debug] Command-line config: ['-vU', 'test:youtube'] [debug] Portable config "yt-dlp.conf": ['-i'] [debug] Encodings: locale cp65001, fs utf-8, pref cp65001, out utf-8, error utf-8, screen utf-8 - [debug] yt-dlp version 2023.09.24 [9d339c4] (win32_exe) + [debug] yt-dlp version 2023.10.07 [9d339c4] (win32_exe) [debug] Python 3.8.10 (CPython 64bit) - Windows-10-10.0.22000-SP0 [debug] Checking exe version: ffmpeg -bsfs [debug] Checking exe version: ffprobe -bsfs @@ -67,7 +67,7 @@ body: [debug] Optional libraries: Cryptodome-3.15.0, brotli-1.0.9, certifi-2022.06.15, mutagen-1.45.1, sqlite3-2.6.0, websockets-10.3 [debug] Proxy map: {} [debug] Fetching release info: https://api.github.com/repos/yt-dlp/yt-dlp/releases/latest - Latest version: 2023.09.24, Current version: 2023.09.24 - yt-dlp is up to date (2023.09.24) + Latest version: 2023.10.07, Current version: 2023.10.07 + yt-dlp is up to date (2023.10.07) render: shell diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 72b9584ec..8eda41307 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -503,3 +503,9 @@ Yalab7 zhallgato zhong-yiyu Zprokkel +AS6939 +drzraf +handlerug +jiru +madewokherd +xofe diff --git a/Changelog.md b/Changelog.md index 04511927f..48dcbf102 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,35 @@ # Changelog # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2023.10.07 + +#### Extractor changes +- **abc.net.au**: iview: [Improve `episode` extraction](https://github.com/yt-dlp/yt-dlp/commit/a9efb4b8d74f3583450ffda0ee57259a47d39c70) ([#8201](https://github.com/yt-dlp/yt-dlp/issues/8201)) by [xofe](https://github.com/xofe) +- **erocast**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/47c598783c98c179e04dd12c2a3fee0f3dc53087) ([#8264](https://github.com/yt-dlp/yt-dlp/issues/8264)) by [madewokherd](https://github.com/madewokherd) +- **gofile**: [Fix token cookie bug](https://github.com/yt-dlp/yt-dlp/commit/0730d5a966fa8a937d84bfb7f68be5198acb039b) by [bashonly](https://github.com/bashonly) +- **iq.com**: [Fix extraction and subtitles](https://github.com/yt-dlp/yt-dlp/commit/35d9cbaf9638ccc9daf8a863063b2e7c135bc664) ([#8260](https://github.com/yt-dlp/yt-dlp/issues/8260)) by [AS6939](https://github.com/AS6939) +- **lbry** + - [Add playlist support](https://github.com/yt-dlp/yt-dlp/commit/48cceec1ddb8649b5e771df8df79eb9c39c82b90) ([#8213](https://github.com/yt-dlp/yt-dlp/issues/8213)) by [bashonly](https://github.com/bashonly), [drzraf](https://github.com/drzraf), [Grub4K](https://github.com/Grub4K) + - [Extract `uploader_id`](https://github.com/yt-dlp/yt-dlp/commit/0e722f2f3ca42e634fd7b06ee70b16bf833ce132) ([#8244](https://github.com/yt-dlp/yt-dlp/issues/8244)) by [drzraf](https://github.com/drzraf) +- **litv**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/91a670a4f7babe9c8aa2018f57d8c8952a6f49d8) ([#7785](https://github.com/yt-dlp/yt-dlp/issues/7785)) by [jiru](https://github.com/jiru) +- **neteasemusic**: [Fix extractors](https://github.com/yt-dlp/yt-dlp/commit/f980df734cf5c0eaded2f7b38c6c60bccfeebb48) ([#8181](https://github.com/yt-dlp/yt-dlp/issues/8181)) by [c-basalt](https://github.com/c-basalt) +- **nhk**: [Fix VOD extraction](https://github.com/yt-dlp/yt-dlp/commit/e831c80e8b2fc025b3b67d82974cc59e3526fdc8) ([#8249](https://github.com/yt-dlp/yt-dlp/issues/8249)) by [garret1317](https://github.com/garret1317) +- **radiko**: [Improve 
extraction](https://github.com/yt-dlp/yt-dlp/commit/2ad3873f0dfa9285c91d2160e36c039e69d597c7) ([#8221](https://github.com/yt-dlp/yt-dlp/issues/8221)) by [garret1317](https://github.com/garret1317)
+- **substack**
+    - [Fix download cookies bug](https://github.com/yt-dlp/yt-dlp/commit/2f2dda3a7e85148773da3cdbc03ac9949ec1bc45) ([#8219](https://github.com/yt-dlp/yt-dlp/issues/8219)) by [handlerug](https://github.com/handlerug)
+    - [Fix embed extraction](https://github.com/yt-dlp/yt-dlp/commit/fbcc299bd8a19cf8b3c8805d6c268a9110230973) ([#8218](https://github.com/yt-dlp/yt-dlp/issues/8218)) by [handlerug](https://github.com/handlerug)
+- **theta**: [Remove extractors](https://github.com/yt-dlp/yt-dlp/commit/792f1e64f6a2beac51e85408d142b3118115c4fd) ([#8251](https://github.com/yt-dlp/yt-dlp/issues/8251)) by [alerikaisattera](https://github.com/alerikaisattera)
+- **wrestleuniversevod**: [Call API with device ID](https://github.com/yt-dlp/yt-dlp/commit/b095fd3fa9d58a65dc9b830bd63b9d909422aa86) ([#8272](https://github.com/yt-dlp/yt-dlp/issues/8272)) by [bashonly](https://github.com/bashonly)
+- **xhamster**: user: [Support creator urls](https://github.com/yt-dlp/yt-dlp/commit/cc8d8441524ec3442d7c0d3f8f33f15b66aa06f3) ([#8232](https://github.com/yt-dlp/yt-dlp/issues/8232)) by [Grub4K](https://github.com/Grub4K)
+- **youtube**
+    - [Fix `heatmap` extraction](https://github.com/yt-dlp/yt-dlp/commit/03e85ea99db76a2fddb65bf46f8819bda780aaf3) ([#8299](https://github.com/yt-dlp/yt-dlp/issues/8299)) by [bashonly](https://github.com/bashonly)
+    - [Raise a warning for `Incomplete Data` instead of an error](https://github.com/yt-dlp/yt-dlp/commit/eb5bdbfa70126c7d5355cc0954b63720522e462c) ([#8238](https://github.com/yt-dlp/yt-dlp/issues/8238)) by [coletdjnz](https://github.com/coletdjnz)
+
+#### Misc. changes
+- **cleanup**
+    - [Update extractor tests](https://github.com/yt-dlp/yt-dlp/commit/19c90e405b4137c06dfe6f9aaa02396df0da93e5) ([#7718](https://github.com/yt-dlp/yt-dlp/issues/7718)) by [trainman261](https://github.com/trainman261)
+    - Miscellaneous: [377e85a](https://github.com/yt-dlp/yt-dlp/commit/377e85a1797db9e98b78b38203ed9d4ded229991) by [dirkf](https://github.com/dirkf), [gamer191](https://github.com/gamer191), [Grub4K](https://github.com/Grub4K)
+
 ### 2023.09.24
 
 #### Important changes
diff --git a/supportedsites.md b/supportedsites.md
index 620e0f305..ecef4dc2d 100644
--- a/supportedsites.md
+++ b/supportedsites.md
@@ -422,6 +422,7 @@ # Supported sites
 - **eplus:inbound**: e+ (イープラス) overseas
 - **Epoch**
 - **Eporner**
+ - **Erocast**
 - **EroProfile**: [*eroprofile*](## "netrc machine")
 - **EroProfile:album**
 - **ertflix**: ERTFLIX videos
@@ -699,6 +700,7 @@ # Supported sites
 - **LastFMUser**
 - **lbry**
 - **lbry:channel**
+ - **lbry:playlist**
 - **LCI**
 - **Lcp**
 - **LcpPlay**
@@ -1474,8 +1476,6 @@ # Supported sites
 - **ThePlatformFeed**
 - **TheStar**
 - **TheSun**
- - **ThetaStream**
- - **ThetaVideo**
 - **TheWeatherChannel**
 - **ThisAmericanLife**
 - **ThisAV**
diff --git a/yt_dlp/version.py b/yt_dlp/version.py
index 2a7c84b93..60c1c94cc 100644
--- a/yt_dlp/version.py
+++ b/yt_dlp/version.py
@@ -1,8 +1,8 @@
 # Autogenerated by devscripts/update-version.py
 
-__version__ = '2023.09.24'
+__version__ = '2023.10.07'
 
-RELEASE_GIT_HEAD = '088add9567d39b758737e4299a0e619fd89d2e8f'
+RELEASE_GIT_HEAD = '377e85a1797db9e98b78b38203ed9d4ded229991'
 
 VARIANT = None
From 9d7ded6419089c1bf252496073f73ad90ed71004 Mon Sep 17 00:00:00 2001
From: Awal Garg
Date: Sun, 8 Oct 2023 01:57:23 +0200
Subject: [PATCH 217/218] [utils] `js_to_json`: Fix `Date` constructor parsing
 (#8295)

Authored by: awalgarg, Grub4K
---
 test/test_utils.py     | 7 ++++++-
 yt_dlp/utils/_utils.py | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/test/test_utils.py b/test/test_utils.py
index fd612ff86..77040f29c 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1209,6 +1209,9 @@ def test_js_to_json_edgecases(self):
         on = js_to_json('\'"\\""\'')
         self.assertEqual(json.loads(on), '"""', msg='Unnecessary quote escape should be escaped')
 
+        on = js_to_json('[new Date("spam"), \'("eggs")\']')
+        self.assertEqual(json.loads(on), ['spam', '("eggs")'], msg='Date regex should match a single string')
+
     def test_js_to_json_malformed(self):
         self.assertEqual(js_to_json('42a1'), '42"a1"')
         self.assertEqual(js_to_json('42a-1'), '42"a"-1')
@@ -1220,11 +1223,13 @@ def test_js_to_json_template_literal(self):
         self.assertEqual(js_to_json('`${name}"${name}"`', {'name': '5'}), '"5\\"5\\""')
         self.assertEqual(js_to_json('`${name}`', {}), '"name"')
 
-    def test_js_to_json_map_array_constructors(self):
+    def test_js_to_json_common_constructors(self):
         self.assertEqual(json.loads(js_to_json('new Map([["a", 5]])')), {'a': 5})
         self.assertEqual(json.loads(js_to_json('Array(5, 10)')), [5, 10])
         self.assertEqual(json.loads(js_to_json('new Array(15,5)')), [15, 5])
         self.assertEqual(json.loads(js_to_json('new Map([Array(5, 10),new Array(15,5)])')), {'5': 10, '15': 5})
+        self.assertEqual(json.loads(js_to_json('new Date("123")')), "123")
+        self.assertEqual(json.loads(js_to_json('new Date(\'2023-10-19\')')), "2023-10-19")
 
     def test_extract_attributes(self):
         self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py
index ba6242380..3dc17bf59 100644
--- a/yt_dlp/utils/_utils.py
+++ b/yt_dlp/utils/_utils.py
@@ -2744,7 +2744,7 @@ def create_map(mobj):
     code = re.sub(r'(?:new\s+)?Array\((.*?)\)', r'[\g<1>]', code)
     code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
     if not strict:
-        code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
+        code = re.sub(rf'new Date\(({STRING_RE})\)', r'\g<1>', code)
         code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code)
         code = re.sub(r'parseInt\([^\d]+(\d+)[^\d]+\)', r'\1', code)
         code = re.sub(r'\(function\([^)]*\)\s*\{[^}]*\}\s*\)\s*\(\s*(["\'][^)]*["\'])\s*\)', r'\1', code)
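The net effect of the patch above: `js_to_json` now unwraps a `Date(...)`
argument whether it is single- or double-quoted, and the old greedy `(".+")`
group can no longer swallow everything between two unrelated strings. A quick
sanity check mirroring the new tests (a sketch, assuming a checkout where
`js_to_json` is importable from `yt_dlp.utils`):

    import json

    from yt_dlp.utils import js_to_json

    # Single-quoted Date arguments now unwrap cleanly ...
    assert json.loads(js_to_json("new Date('2023-10-19')")) == '2023-10-19'
    # ... and a Date call no longer consumes its neighbours:
    assert json.loads(js_to_json('[new Date("spam"), \'("eggs")\']')) == ['spam', '("eggs")']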
From 1c51c520f7b511ebd9e4eb7322285a8c31eedbbd Mon Sep 17 00:00:00 2001
From: Simon Sawicki
Date: Sun, 8 Oct 2023 02:01:01 +0200
Subject: [PATCH 218/218] [fd/fragment] Improve progress calculation (#8241)

This uses the download speed from all threads and also adds smoothing to speed and eta

Authored by: Grub4K
---
 yt_dlp/downloader/fragment.py |  48 ++++++---------
 yt_dlp/utils/progress.py      | 109 ++++++++++++++++++++++++++++++++++
 2 files changed, 128 insertions(+), 29 deletions(-)
 create mode 100644 yt_dlp/utils/progress.py

diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py
index b4b680dae..b4f003d37 100644
--- a/yt_dlp/downloader/fragment.py
+++ b/yt_dlp/downloader/fragment.py
@@ -14,6 +14,7 @@
 from ..networking.exceptions import HTTPError, IncompleteRead
 from ..utils import DownloadError, RetryManager, encodeFilename, traverse_obj
 from ..utils.networking import HTTPHeaderDict
+from ..utils.progress import ProgressCalculator
 
 
 class HttpQuietDownloader(HttpFD):
@@ -226,8 +227,7 @@ def _start_frag_download(self, ctx, info_dict):
         resume_len = ctx['complete_frags_downloaded_bytes']
         total_frags = ctx['total_frags']
         ctx_id = ctx.get('ctx_id')
-        # This dict stores the download progress, it's updated by the progress
-        # hook
+        # Stores the download progress, updated by the progress hook
         state = {
             'status': 'downloading',
             'downloaded_bytes': resume_len,
@@ -237,14 +237,8 @@ def _start_frag_download(self, ctx, info_dict):
             'tmpfilename': ctx['tmpfilename'],
         }
 
-        start = time.time()
-        ctx.update({
-            'started': start,
-            'fragment_started': start,
-            # Amount of fragment's bytes downloaded by the time of the previous
-            # frag progress hook invocation
-            'prev_frag_downloaded_bytes': 0,
-        })
+        ctx['started'] = time.time()
+        progress = ProgressCalculator(resume_len)
 
         def frag_progress_hook(s):
             if s['status'] not in ('downloading', 'finished'):
@@ -259,38 +253,35 @@ def frag_progress_hook(s):
             state['max_progress'] = ctx.get('max_progress')
             state['progress_idx'] = ctx.get('progress_idx')
 
-            time_now = time.time()
-            state['elapsed'] = time_now - start
+            state['elapsed'] = progress.elapsed
             frag_total_bytes = s.get('total_bytes') or 0
             s['fragment_info_dict'] = s.pop('info_dict', {})
+
+            # XXX: Fragment resume is not accounted for here
             if not ctx['live']:
                 estimated_size = (
                     (ctx['complete_frags_downloaded_bytes'] + frag_total_bytes)
                     / (state['fragment_index'] + 1) * total_frags)
-                state['total_bytes_estimate'] = estimated_size
+                progress.total = estimated_size
+                progress.update(s.get('downloaded_bytes'))
+                state['total_bytes_estimate'] = progress.total
+            else:
+                progress.update(s.get('downloaded_bytes'))
 
             if s['status'] == 'finished':
                 state['fragment_index'] += 1
                 ctx['fragment_index'] = state['fragment_index']
-                state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes']
-                ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes']
-                ctx['speed'] = state['speed'] = self.calc_speed(
-                    ctx['fragment_started'], time_now, frag_total_bytes)
-                ctx['fragment_started'] = time.time()
-                ctx['prev_frag_downloaded_bytes'] = 0
-            else:
-                frag_downloaded_bytes = s['downloaded_bytes']
-                state['downloaded_bytes'] += frag_downloaded_bytes - ctx['prev_frag_downloaded_bytes']
-                ctx['speed'] = state['speed'] = self.calc_speed(
-                    ctx['fragment_started'], time_now, frag_downloaded_bytes - ctx.get('frag_resume_len', 0))
-                if not ctx['live']:
-                    state['eta'] = self.calc_eta(state['speed'], estimated_size - state['downloaded_bytes'])
-                ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes
+                progress.thread_reset()
+
+            state['downloaded_bytes'] = ctx['complete_frags_downloaded_bytes'] = progress.downloaded
+            state['speed'] = ctx['speed'] = progress.speed.smooth
+            state['eta'] = progress.eta.smooth
+
             self._hook_progress(state, info_dict)
 
         ctx['dl'].add_progress_hook(frag_progress_hook)
 
-        return start
+        return ctx['started']
 
     def _finish_frag_download(self, ctx, info_dict):
         ctx['dest_stream'].close()
@@ -500,7 +491,6 @@ def _download_fragment(fragment):
                 download_fragment(fragment, ctx_copy)
                 return fragment, fragment['frag_index'], ctx_copy.get('fragment_filename_sanitized')
 
-        self.report_warning('The download speed shown is only of one thread. This is a known issue')
         with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool:
             try:
                 for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments):
diff --git a/yt_dlp/utils/progress.py b/yt_dlp/utils/progress.py
new file mode 100644
index 000000000..f254a3887
--- /dev/null
+++ b/yt_dlp/utils/progress.py
@@ -0,0 +1,109 @@
+from __future__ import annotations
+
+import bisect
+import threading
+import time
+
+
+class ProgressCalculator:
+    # Time to calculate the speed over (seconds)
+    SAMPLING_WINDOW = 3
+    # Minimum timeframe before to sample next downloaded bytes (seconds)
+    SAMPLING_RATE = 0.05
+    # Time before showing eta (seconds)
+    GRACE_PERIOD = 1
+
+    def __init__(self, initial: int):
+        self._initial = initial or 0
+        self.downloaded = self._initial
+
+        self.elapsed: float = 0
+        self.speed = SmoothValue(0, smoothing=0.7)
+        self.eta = SmoothValue(None, smoothing=0.9)
+
+        self._total = 0
+        self._start_time = time.monotonic()
+        self._last_update = self._start_time
+
+        self._lock = threading.Lock()
+        self._thread_sizes: dict[int, int] = {}
+
+        self._times = [self._start_time]
+        self._downloaded = [self.downloaded]
+
+    @property
+    def total(self):
+        return self._total
+
+    @total.setter
+    def total(self, value: int | None):
+        with self._lock:
+            if value is not None and value < self.downloaded:
+                value = self.downloaded
+
+            self._total = value
+
+    def thread_reset(self):
+        current_thread = threading.get_ident()
+        with self._lock:
+            self._thread_sizes[current_thread] = 0
+
+    def update(self, size: int | None):
+        if not size:
+            return
+
+        current_thread = threading.get_ident()
+
+        with self._lock:
+            last_size = self._thread_sizes.get(current_thread, 0)
+            self._thread_sizes[current_thread] = size
+            self._update(size - last_size)
+
+    def _update(self, size: int):
+        current_time = time.monotonic()
+
+        self.downloaded += size
+        self.elapsed = current_time - self._start_time
+        if self.total is not None and self.downloaded > self.total:
+            self._total = self.downloaded
+
+        if self._last_update + self.SAMPLING_RATE > current_time:
+            return
+        self._last_update = current_time
+
+        self._times.append(current_time)
+        self._downloaded.append(self.downloaded)
+
+        offset = bisect.bisect_left(self._times, current_time - self.SAMPLING_WINDOW)
+        del self._times[:offset]
+        del self._downloaded[:offset]
+        if len(self._times) < 2:
+            self.speed.reset()
+            self.eta.reset()
+            return
+
+        download_time = current_time - self._times[0]
+        if not download_time:
+            return
+
+        self.speed.set((self.downloaded - self._downloaded[0]) / download_time)
+        if self.total and self.speed.value and self.elapsed > self.GRACE_PERIOD:
+            self.eta.set((self.total - self.downloaded) / self.speed.value)
+        else:
+            self.eta.reset()
+
+
+class SmoothValue:
+    def __init__(self, initial: float | None, smoothing: float):
+        self.value = self.smooth = self._initial = initial
+        self._smoothing = smoothing
+
+    def set(self, value: float):
+        self.value = value
+        if self.smooth is None:
+            self.smooth = self.value
+        else:
+            self.smooth = (1 - self._smoothing) * value + self._smoothing * self.smooth
+
+    def reset(self):
+        self.value = self.smooth = self._initial
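The thread-awareness promised in the commit message lives in `update()`: each
worker reports the cumulative byte count of its current fragment, keyed by
`threading.get_ident()`, and only the delta is added to the shared totals;
`thread_reset()` zeroes that per-thread counter once a fragment finishes.
Speed and eta are then exponentially smoothed by `SmoothValue`
(smooth = (1 - a) * new + a * smooth, with a = 0.7 for speed and 0.9 for eta).
A minimal driver sketch; the worker function and chunk sizes here are
hypothetical, only the `ProgressCalculator` API comes from the patch:

    import threading

    from yt_dlp.utils.progress import ProgressCalculator

    progress = ProgressCalculator(0)
    progress.total = 4 * 64 * 4096  # expected overall size, used for the eta

    def worker(num_chunks):
        # Report the *cumulative* size of the current fragment; update()
        # derives the per-thread delta internally.
        downloaded = 0
        for _ in range(num_chunks):
            downloaded += 4096
            progress.update(downloaded)
        # Zero this thread's counter before the next fragment so its
        # cumulative sizes are not double-counted.
        progress.thread_reset()

    threads = [threading.Thread(target=worker, args=(64,)) for _ in range(4)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # In a real download the updates arrive over time, so the sampling
    # window fills up and these become meaningful running values.
    print(progress.downloaded, progress.speed.smooth, progress.eta.smooth)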