From 5b0a6a801084cced4b71c255270f53c881203ca8 Mon Sep 17 00:00:00 2001 From: insaneracist Date: Thu, 29 Oct 2020 16:11:14 -0700 Subject: [PATCH 01/49] [youtube] fix: extract mix playlist ids from ytInitialData (#33) --- youtube_dlc/extractor/youtube.py | 35 ++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 5fd22081a2..0354866eff 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -279,6 +279,15 @@ def _download_webpage_handle(self, *args, **kwargs): return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) + def _get_yt_initial_data(self, video_id, webpage): + config = self._search_regex( + (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});', + r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'), + webpage, 'ytInitialData', default=None) + if config: + return self._parse_json( + uppercase_escape(config), video_id, fatal=False) + def _real_initialize(self): if self._downloader is None: return @@ -1397,15 +1406,6 @@ def _get_ytplayer_config(self, video_id, webpage): return self._parse_json( uppercase_escape(config), video_id, fatal=False) - def _get_yt_initial_data(self, video_id, webpage): - config = self._search_regex( - (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});', - r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'), - webpage, 'ytInitialData', default=None) - if config: - return self._parse_json( - uppercase_escape(config), video_id, fatal=False) - def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" @@ -2765,6 +2765,16 @@ def extract_videos_from_page(self, page): return zip(ids_in_page, titles_in_page) + def _extract_mix_ids_from_yt_initial(self, yt_initial): + ids = [] + playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents']) + if type(playlist_contents) is list: + for item in playlist_contents: + videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId']) + if type(videoId) is str: + ids.append(videoId) + return ids + def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id @@ -2778,6 +2788,13 @@ def _extract_mix(self, playlist_id): r'''(?xs)data-video-username=".*?".*? href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), webpage)) + + # if no ids in html of page, try using embedded json + if (len(new_ids) == 0): + yt_initial = self._get_yt_initial_data(playlist_id, webpage) + if yt_initial: + new_ids = self._extract_mix_ids_from_yt_initial(yt_initial) + # Fetch new pages until all the videos are repeated, it seems that # there are always 51 unique videos. new_ids = [_id for _id in new_ids if _id not in ids] From 59c5fa91c167a8d011a4efa073ad6fd0027b2ed8 Mon Sep 17 00:00:00 2001 From: Peter Oettig Date: Fri, 30 Oct 2020 23:24:55 +0100 Subject: [PATCH 02/49] Fixed problem with new youtube player, leading to "Unable to extract video data". --- youtube_dlc/extractor/youtube.py | 123 ++++++++++++++++++------------- 1 file changed, 70 insertions(+), 53 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 5fd22081a2..3e1adc554d 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -1390,6 +1390,7 @@ def _get_ytplayer_config(self, video_id, webpage): # https://github.com/ytdl-org/youtube-dl/pull/7599) r';ytplayer\.config\s*=\s*({.+?});ytplayer', r';ytplayer\.config\s*=\s*({.+?});', + r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' ) config = self._search_regex( patterns, webpage, 'ytplayer.config', default=None) @@ -1416,10 +1417,11 @@ def _get_automatic_captions(self, video_id, webpage): self._downloader.report_warning(err_msg) return {} try: - args = player_config['args'] - caption_url = args.get('ttsurl') - if caption_url: + if "args" in player_config and "ttsurl" in player_config["args"]: + args = player_config['args'] + caption_url = args['ttsurl'] timestamp = args['timestamp'] + # We get the available subtitles list_params = compat_urllib_parse_urlencode({ 'type': 'list', @@ -1475,40 +1477,50 @@ def make_captions(sub_url, sub_langs): return captions # New captions format as of 22.06.2017 - player_response = args.get('player_response') - if player_response and isinstance(player_response, compat_str): - player_response = self._parse_json( - player_response, video_id, fatal=False) - if player_response: - renderer = player_response['captions']['playerCaptionsTracklistRenderer'] - caption_tracks = renderer['captionTracks'] - for caption_track in caption_tracks: - if 'kind' not in caption_track: - # not an automatic transcription - continue - base_url = caption_track['baseUrl'] - sub_lang_list = [] - for lang in renderer['translationLanguages']: - lang_code = lang.get('languageCode') - if lang_code: - sub_lang_list.append(lang_code) - return make_captions(base_url, sub_lang_list) + if "args" in player_config: + player_response = player_config["args"].get('player_response') + else: + # New player system (ytInitialPlayerResponse) as of October 2020 + player_response = player_config - self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id) - return {} - # Some videos don't provide ttsurl but rather caption_tracks and - # caption_translation_languages (e.g. 20LmZk1hakA) - # Does not used anymore as of 22.06.2017 - caption_tracks = args['caption_tracks'] - caption_translation_languages = args['caption_translation_languages'] - caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - sub_lang_list = [] - for lang in caption_translation_languages.split(','): - lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) - sub_lang = lang_qs.get('lc', [None])[0] - if sub_lang: - sub_lang_list.append(sub_lang) - return make_captions(caption_url, sub_lang_list) + if player_response: + if isinstance(player_response, compat_str): + player_response = self._parse_json( + player_response, video_id, fatal=False) + + renderer = player_response['captions']['playerCaptionsTracklistRenderer'] + caption_tracks = renderer['captionTracks'] + for caption_track in caption_tracks: + if 'kind' not in caption_track: + # not an automatic transcription + continue + base_url = caption_track['baseUrl'] + sub_lang_list = [] + for lang in renderer['translationLanguages']: + lang_code = lang.get('languageCode') + if lang_code: + sub_lang_list.append(lang_code) + return make_captions(base_url, sub_lang_list) + + self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id) + return {} + + if "args" in player_config: + args = player_config["args"] + + # Some videos don't provide ttsurl but rather caption_tracks and + # caption_translation_languages (e.g. 20LmZk1hakA) + # Does not used anymore as of 22.06.2017 + caption_tracks = args['caption_tracks'] + caption_translation_languages = args['caption_translation_languages'] + caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] + sub_lang_list = [] + for lang in caption_translation_languages.split(','): + lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) + sub_lang = lang_qs.get('lc', [None])[0] + if sub_lang: + sub_lang_list.append(sub_lang) + return make_captions(caption_url, sub_lang_list) # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles except (KeyError, IndexError, ExtractorError): @@ -1784,21 +1796,24 @@ def extract_embedded_config(embed_webpage, video_id): # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) if ytplayer_config: - args = ytplayer_config['args'] - if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): - # Convert to the same format returned by compat_parse_qs - video_info = dict((k, [v]) for k, v in args.items()) - add_dash_mpd(video_info) - # Rental video is not rented but preview is available (e.g. - # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/ytdl-org/youtube-dl/issues/10532) - if not video_info and args.get('ypc_vid'): - return self.url_result( - args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) - if args.get('livestream') == '1' or args.get('live_playback') == 1: - is_live = True - if not player_response: - player_response = extract_player_response(args.get('player_response'), video_id) + args = ytplayer_config.get("args") + if args is not None: + if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): + # Convert to the same format returned by compat_parse_qs + video_info = dict((k, [v]) for k, v in args.items()) + add_dash_mpd(video_info) + # Rental video is not rented but preview is available (e.g. + # https://www.youtube.com/watch?v=yYr8q0y5Jfg, + # https://github.com/ytdl-org/youtube-dl/issues/10532) + if not video_info and args.get('ypc_vid'): + return self.url_result( + args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) + if args.get('livestream') == '1' or args.get('live_playback') == 1: + is_live = True + if not player_response: + player_response = extract_player_response(args.get('player_response'), video_id) + elif not player_response: + player_response = ytplayer_config if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) else: @@ -1828,8 +1843,8 @@ def extract_embedded_config(embed_webpage, video_id): age_gate = False # Try looking directly into the video webpage ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) - if ytplayer_config: - args = ytplayer_config['args'] + args = ytplayer_config.get("args") + if args is not None: if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'): # Convert to the same format returned by compat_parse_qs video_info = dict((k, [v]) for k, v in args.items()) @@ -1844,6 +1859,8 @@ def extract_embedded_config(embed_webpage, video_id): is_live = True if not player_response: player_response = extract_player_response(args.get('player_response'), video_id) + elif not player_response: + player_response = ytplayer_config if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) From e61f360157dfa51f2fd1cbc089c0c9a0680428a1 Mon Sep 17 00:00:00 2001 From: nixxo Date: Sat, 31 Oct 2020 14:52:07 +0100 Subject: [PATCH 03/49] [skyitalia] added geoblock msg --- youtube_dlc/extractor/skyitalia.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dlc/extractor/skyitalia.py b/youtube_dlc/extractor/skyitalia.py index 3c7bd465df..22a6be2be7 100644 --- a/youtube_dlc/extractor/skyitalia.py +++ b/youtube_dlc/extractor/skyitalia.py @@ -13,6 +13,7 @@ class SkyItaliaBaseIE(InfoExtractor): 'high': [854, 480], 'hd': [1280, 720] } + _GEO_BYPASS = False def _extract_video_id(self, url): webpage = self._download_webpage(url, 'skyitalia') @@ -43,6 +44,9 @@ def _get_formats(self, video_id, token): 'height': r[1] }) + if not formats and video_data.get('geob') == 1: + self.raise_geo_restricted(countries=['IT']) + self._sort_formats(formats) title = video_data.get('title') thumb = video_data.get('thumb') From 167c108f7072a8392c509e5e8b9f84c0e0c0bb28 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Mon, 2 Nov 2020 08:52:55 +0100 Subject: [PATCH 04/49] [skip travis] --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 08bddaa187..5a26906aca 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ [![Build Status](https://travis-ci.com/blackjack4494/yt-dlc.svg?branch=master)](https://travis-ci.com/blackjack4494/yt-dlc) [![PyPi](https://img.shields.io/pypi/v/youtube-dlc.svg)](https://pypi.org/project/youtube-dlc) -[![Downloads](https://pepy.tech/badge/youtube-dlc)](https://pepy.tech/project/youtube-dlc) [![Gitter chat](https://img.shields.io/gitter/room/youtube-dlc/community)](https://gitter.im/youtube-dlc) [![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/blackjack4494/youtube-dlc/blob/master/LICENSE) From 5c15c1a0d7c27d34e7d03161c5b27bf923e314cd Mon Sep 17 00:00:00 2001 From: insaneracist Date: Mon, 2 Nov 2020 14:54:47 -0800 Subject: [PATCH 05/49] python2: don't use str, use compat_str --- youtube_dlc/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index ad67fa4100..d8f0dab1f5 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -2813,11 +2813,11 @@ def extract_videos_from_page(self, page): def _extract_mix_ids_from_yt_initial(self, yt_initial): ids = [] - playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents']) - if type(playlist_contents) is list: + playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'], list) + if playlist_contents: for item in playlist_contents: - videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId']) - if type(videoId) is str: + videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str) + if videoId: ids.append(videoId) return ids From 0536e60b48041d9c7d9ce8bbbef0eb2131ce3919 Mon Sep 17 00:00:00 2001 From: exwm Date: Sun, 1 Nov 2020 14:18:27 -0500 Subject: [PATCH 06/49] [vlive] fix: extractor tests for VODs --- youtube_dlc/extractor/vlive.py | 79 +++++++++++++++++----------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index f79531e6f3..cc1d20a3a9 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -11,7 +11,6 @@ from ..utils import ( ExtractorError, merge_dicts, - remove_start, try_get, urlencode_postdata, ) @@ -97,49 +96,49 @@ def is_logged_in(): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - webpage = self._download_webpage( - 'https://www.vlive.tv/video/%s' % video_id, video_id) + PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*' + PARAMS_FIELD = 'params' - VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)' - VIDEO_PARAMS_FIELD = 'video params' + params = self._search_regex( + PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL) + params = self._parse_json(params, video_id, fatal=False) - params = self._parse_json(self._search_regex( - VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id, - transform_source=lambda s: '[' + s + ']', fatal=False) + video_params = params["postDetail"]["post"].get("officialVideo") + if video_params is None: + raise ExtractorError('Invalid key: Failed to extract video parameters.') - if not params or len(params) < 7: - params = self._search_regex( - VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD) - params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)] + long_video_id = video_params["vodId"] + video_type = video_params["type"] + KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id + key_json = self._download_json(KEY_ENDPOINT, video_id, + headers={"referer": "https://www.vlive.tv"}) + key = key_json["inkey"] - status, long_video_id, key = params[2], params[5], params[6] - status = remove_start(status, 'PRODUCT_') - - if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'): - return self._live(video_id, webpage) - elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'): - return self._replay(video_id, webpage, long_video_id, key) - - if status == 'LIVE_END': - raise ExtractorError('Uploading for replay. Please wait...', - expected=True) - elif status == 'COMING_SOON': - raise ExtractorError('Coming soon!', expected=True) - elif status == 'CANCELED': - raise ExtractorError('We are sorry, ' - 'but the live broadcast has been canceled.', - expected=True) - elif status == 'ONLY_APP': - raise ExtractorError('Unsupported video type', expected=True) + if video_type in ('VOD'): + encoding_status = video_params["encodingStatus"] + if encoding_status == 'COMPLETE': + return self._replay(video_id, webpage, long_video_id, key, params) + else: + raise ExtractorError('VOD encoding not yet complete. Please try again later.', + expected=True) + elif video_type in ('LIVE'): + video_status = video_params["status"] + if video_status == 'RESERVED': + raise ExtractorError('Coming soon!', expected=True) + else: + return self._live(video_id, webpage, params) else: - raise ExtractorError('Unknown status %s' % status) + raise ExtractorError('Unknown video type %s' % video_type) - def _get_common_fields(self, webpage): + def _get_common_fields(self, webpage, params): title = self._og_search_title(webpage) - creator = self._html_search_regex( - r']+class="info_area"[^>]*>\s*(?:]*>.*?\s*)?]*>([^<]+)', - webpage, 'creator', fatal=False) + description = self._html_search_meta( + ['og:description', 'description', 'twitter:description'], + webpage, 'description', default=None) + creator = (try_get(params, lambda x: x["channel"]["channel"]["channelName"], compat_str) + or self._search_regex(r'on (.*) channel', description or '', 'creator', fatal=False)) thumbnail = self._og_search_thumbnail(webpage) return { 'title': title, @@ -147,7 +146,7 @@ def _get_common_fields(self, webpage): 'thumbnail': thumbnail, } - def _live(self, video_id, webpage): + def _live(self, video_id, webpage, params): init_page = self._download_init_page(video_id) live_params = self._search_regex( @@ -164,7 +163,7 @@ def _live(self, video_id, webpage): fatal=False, live=True)) self._sort_formats(formats) - info = self._get_common_fields(webpage) + info = self._get_common_fields(webpage, params) info.update({ 'title': self._live_title(info['title']), 'id': video_id, @@ -173,7 +172,7 @@ def _live(self, video_id, webpage): }) return info - def _replay(self, video_id, webpage, long_video_id, key): + def _replay(self, video_id, webpage, long_video_id, key, params): if '' in (long_video_id, key): init_page = self._download_init_page(video_id) video_info = self._parse_json(self._search_regex( @@ -186,7 +185,7 @@ def _replay(self, video_id, webpage, long_video_id, key): long_video_id, key = video_info['vid'], video_info['inkey'] return merge_dicts( - self._get_common_fields(webpage), + self._get_common_fields(webpage, params), self._extract_video_info(video_id, long_video_id, key)) def _download_init_page(self, video_id): From 5dcfd2508add09ab46d730f4802ce6da73edafaf Mon Sep 17 00:00:00 2001 From: exwm Date: Sun, 1 Nov 2020 15:04:05 -0500 Subject: [PATCH 07/49] [vlive] add: support video post urls --- youtube_dlc/extractor/vlive.py | 41 +++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index cc1d20a3a9..abbcfb32bc 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -18,10 +18,10 @@ class VLiveIE(NaverBaseIE): IE_NAME = 'vlive' - _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P[0-9]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|post)/(?P(?:\d-)?[0-9]+)' _NETRC_MACHINE = 'vlive' _TESTS = [{ - 'url': 'http://www.vlive.tv/video/1326', + 'url': 'https://www.vlive.tv/video/1326', 'md5': 'cc7314812855ce56de70a06a27314983', 'info_dict': { 'id': '1326', @@ -31,8 +31,21 @@ class VLiveIE(NaverBaseIE): 'view_count': int, 'uploader_id': 'muploader_a', }, - }, { - 'url': 'http://www.vlive.tv/video/16937', + }, + { + 'url': 'https://vlive.tv/post/1-18244258', + 'md5': 'cc7314812855ce56de70a06a27314983', + 'info_dict': { + 'id': '1326', + 'ext': 'mp4', + 'title': "[V LIVE] Girl's Day's Broadcast", + 'creator': "Girl's Day", + 'view_count': int, + 'uploader_id': 'muploader_a', + }, + }, + { + 'url': 'https://www.vlive.tv/video/16937', 'info_dict': { 'id': '16937', 'ext': 'mp4', @@ -95,24 +108,30 @@ def is_logged_in(): raise ExtractorError('Unable to log in', expected=True) def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + # url may match on a post or a video url with a post_id potentially matching a video_id + working_id = self._match_id(url) + webpage = self._download_webpage(url, working_id) PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*' PARAMS_FIELD = 'params' params = self._search_regex( PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL) - params = self._parse_json(params, video_id, fatal=False) + params = self._parse_json(params, working_id, fatal=False) - video_params = params["postDetail"]["post"].get("officialVideo") + video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"]) if video_params is None: - raise ExtractorError('Invalid key: Failed to extract video parameters.') + if 'post' in url: + raise ExtractorError('Url does not appear to be a video post.') + else: + raise ExtractorError('Failed to extract video parameters.') + video_id = working_id if 'video' in url else str(video_params["videoSeq"]) long_video_id = video_params["vodId"] video_type = video_params["type"] - KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id - key_json = self._download_json(KEY_ENDPOINT, video_id, + + VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id + key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, headers={"referer": "https://www.vlive.tv"}) key = key_json["inkey"] From 1923b146b378aed234f3cc91a61eb9c5aec2f684 Mon Sep 17 00:00:00 2001 From: exwm Date: Sun, 1 Nov 2020 15:40:47 -0500 Subject: [PATCH 08/49] [vlive] add: support new channel url format --- youtube_dlc/extractor/vlive.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index abbcfb32bc..98c405f219 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -220,15 +220,22 @@ def _download_init_page(self, video_id): class VLiveChannelIE(InfoExtractor): IE_NAME = 'vlive:channel' - _VALID_URL = r'https?://channels\.vlive\.tv/(?P[0-9A-Z]+)' - _TEST = { - 'url': 'http://channels.vlive.tv/FCD4B', + _VALID_URL = r'https?://(?:(?:www|m)\.)?(?:channels\.vlive\.tv/|vlive\.tv/channels?/)(?P[0-9A-Z]+)' + _TESTS = [{ + 'url': 'https://channels.vlive.tv/FCD4B', 'info_dict': { 'id': 'FCD4B', 'title': 'MAMAMOO', }, 'playlist_mincount': 110 - } + }, { + 'url': 'https://www.vlive.tv/channel/FCD4B', + 'info_dict': { + 'id': 'FCD4B', + 'title': 'MAMAMOO', + }, + 'playlist_mincount': 110 + }] _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' def _real_extract(self, url): From 8ba3ad0a48bcc2e12f2ed82c0c5e0999e5e94281 Mon Sep 17 00:00:00 2001 From: exwm Date: Sun, 1 Nov 2020 21:15:45 -0500 Subject: [PATCH 09/49] [vlive] fix: fetching live video not yet uploaded for replay --- youtube_dlc/extractor/vlive.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 98c405f219..70d5d8dfbc 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -144,8 +144,10 @@ def _real_extract(self, url): expected=True) elif video_type in ('LIVE'): video_status = video_params["status"] - if video_status == 'RESERVED': + if video_status in ('RESERVED'): raise ExtractorError('Coming soon!', expected=True) + elif video_status in ('ENDED', 'END'): + raise ExtractorError('Uploading for replay. Please wait...', expected=True) else: return self._live(video_id, webpage, params) else: From 341736255610aea3920d9e8bf627705fdb6756b1 Mon Sep 17 00:00:00 2001 From: exwm Date: Sun, 1 Nov 2020 21:26:17 -0500 Subject: [PATCH 10/49] [vlive] fix: vod logic wrongly used for live video --- youtube_dlc/extractor/vlive.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 70d5d8dfbc..5c8988c927 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -127,18 +127,12 @@ def _real_extract(self, url): raise ExtractorError('Failed to extract video parameters.') video_id = working_id if 'video' in url else str(video_params["videoSeq"]) - long_video_id = video_params["vodId"] + video_type = video_params["type"] - - VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id - key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, - headers={"referer": "https://www.vlive.tv"}) - key = key_json["inkey"] - if video_type in ('VOD'): encoding_status = video_params["encodingStatus"] if encoding_status == 'COMPLETE': - return self._replay(video_id, webpage, long_video_id, key, params) + return self._replay(video_id, webpage, params, video_params) else: raise ExtractorError('VOD encoding not yet complete. Please try again later.', expected=True) @@ -193,7 +187,13 @@ def _live(self, video_id, webpage, params): }) return info - def _replay(self, video_id, webpage, long_video_id, key, params): + def _replay(self, video_id, webpage, params, video_params): + VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id + key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, + headers={"referer": "https://www.vlive.tv"}) + key = key_json["inkey"] + long_video_id = video_params["vodId"] + if '' in (long_video_id, key): init_page = self._download_init_page(video_id) video_info = self._parse_json(self._search_regex( From 73cc1b9125b5f2f80d777f746c16b5e73b92ddd5 Mon Sep 17 00:00:00 2001 From: exwm Date: Mon, 2 Nov 2020 12:19:16 -0500 Subject: [PATCH 11/49] [vlive] fix: live video extractor * use live video info endpoint from v3 api --- youtube_dlc/extractor/vlive.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 5c8988c927..874f5203e5 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -162,19 +162,16 @@ def _get_common_fields(self, webpage, params): } def _live(self, video_id, webpage, params): - init_page = self._download_init_page(video_id) + LIVE_INFO_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/old/v3/live/%s/playInfo' % video_id + play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id, + headers={"referer": "https://www.vlive.tv"}) - live_params = self._search_regex( - r'"liveStreamInfo"\s*:\s*(".*"),', - init_page, 'live stream info') - live_params = self._parse_json(live_params, video_id) - live_params = self._parse_json(live_params, video_id) + streams = try_get(play_info, lambda x: x["result"]["streamList"]) or [] formats = [] - for vid in live_params.get('resolutions', []): + for stream in streams: formats.extend(self._extract_m3u8_formats( - vid['cdnUrl'], video_id, 'mp4', - m3u8_id=vid.get('name'), + stream['serviceUrl'], video_id, 'mp4', fatal=False, live=True)) self._sort_formats(formats) From 130599af9476284e7f0b3be4f68a0ff8346fb6ea Mon Sep 17 00:00:00 2001 From: exwm Date: Mon, 2 Nov 2020 18:34:54 -0500 Subject: [PATCH 12/49] [vlive] fix: raise login required error on vlive+ --- youtube_dlc/extractor/vlive.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 874f5203e5..38d78eda16 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -120,8 +120,15 @@ def _real_extract(self, url): params = self._parse_json(params, working_id, fatal=False) video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"]) + if video_params is None: - if 'post' in url: + error_data = try_get(params, lambda x: x["postDetail"]["error"]["data"]) + product_type = try_get(error_data, + [lambda x: x["officialVideo"]["productType"], + lambda x: x["board"]["boardType"]]) + if product_type in ('VLIVE_PLUS', 'VLIVE+'): + self.raise_login_required('This video is only available for VLIVE+ subscribers') + elif 'post' in url: raise ExtractorError('Url does not appear to be a video post.') else: raise ExtractorError('Failed to extract video parameters.') @@ -191,17 +198,6 @@ def _replay(self, video_id, webpage, params, video_params): key = key_json["inkey"] long_video_id = video_params["vodId"] - if '' in (long_video_id, key): - init_page = self._download_init_page(video_id) - video_info = self._parse_json(self._search_regex( - (r'(?s)oVideoStatus\s*=\s*({.+?})\s* Date: Sun, 12 Apr 2020 23:27:58 +0200 Subject: [PATCH 13/49] [zoomus] Add new extractor --- youtube_dlc/extractor/extractors.py | 1 + youtube_dlc/extractor/zoomus.py | 51 +++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 youtube_dlc/extractor/zoomus.py diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 666134d868..34a8cecd5c 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -1544,4 +1544,5 @@ ) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE +from .zoomus import ZoomUSIE from .zype import ZypeIE diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py new file mode 100644 index 0000000000..150dbced70 --- /dev/null +++ b/youtube_dlc/extractor/zoomus.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, + url_or_none, +) + + +class ZoomUSIE(InfoExtractor): + IE_NAME = 'zoom.us' + _VALID_URL = r'https://zoom.us/recording/play/(?P.*)' + + _TESTS = [{ + 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', + 'info_dict': { + 'ext': 'mp4', + 'topic': "GAZ Transformational Tuesdays W/ Landon & Stapes", + 'recordFileName': "Shared screen with speaker view", + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + #cookie = self._get_cookies(url)['_zm_ssid'] + + video_url = self._search_regex(r"viewMp4Url: \'(.*)\'", webpage, 'video url') + topic = self._search_regex(r"topic: \"(.*)\",", webpage, 'video url') + viewResolvtionsWidth = self._search_regex(r"viewResolvtionsWidth: (.*),", webpage, 'res width') + viewResolvtionsHeight = self._search_regex(r"viewResolvtionsHeight: (.*),", webpage, 'res width') + + formats = [] + formats.append({ + 'url': video_url, + 'width': int_or_none(viewResolvtionsWidth), + 'height': int_or_none(viewResolvtionsHeight), + 'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', + 'Referer': 'https://zoom.us/', + } + }) + self._sort_formats(formats) + + return { + 'id': display_id, + 'title': topic, + 'formats': formats + } \ No newline at end of file From ef6be42014694bf67afb38b19e951180a5d0e9fb Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Sun, 12 Apr 2020 23:40:00 +0200 Subject: [PATCH 14/49] [zoomus] Allow for more urls --- youtube_dlc/extractor/zoomus.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py index 150dbced70..cdcf026e81 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoomus.py @@ -12,7 +12,7 @@ class ZoomUSIE(InfoExtractor): IE_NAME = 'zoom.us' - _VALID_URL = r'https://zoom.us/recording/play/(?P.*)' + _VALID_URL = r'https://(.*).?zoom.us/rec(ording)?/play/(?P.*)' _TESTS = [{ 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', @@ -26,7 +26,6 @@ class ZoomUSIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - #cookie = self._get_cookies(url)['_zm_ssid'] video_url = self._search_regex(r"viewMp4Url: \'(.*)\'", webpage, 'video url') topic = self._search_regex(r"topic: \"(.*)\",", webpage, 'video url') From 55cd2999edad0c9b148d5e9334a74be55bdb668c Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Mon, 13 Apr 2020 00:18:40 +0200 Subject: [PATCH 15/49] [zoomus] Cleanup --- youtube_dlc/extractor/zoomus.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py index cdcf026e81..a0e34801f2 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoomus.py @@ -4,9 +4,6 @@ from .common import InfoExtractor from ..utils import ( int_or_none, - parse_iso8601, - try_get, - url_or_none, ) @@ -14,14 +11,15 @@ class ZoomUSIE(InfoExtractor): IE_NAME = 'zoom.us' _VALID_URL = r'https://(.*).?zoom.us/rec(ording)?/play/(?P.*)' - _TESTS = [{ + _TEST = { 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', 'info_dict': { - 'ext': 'mp4', - 'topic': "GAZ Transformational Tuesdays W/ Landon & Stapes", - 'recordFileName': "Shared screen with speaker view", + 'md5': '031a5b379f1547a8b29c5c4c837dccf2', + 'title': "GAZ Transformational Tuesdays W/ Landon & Stapes", + 'id': "SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK", + 'ext': "mp4", } - }] + } def _real_extract(self, url): display_id = self._match_id(url) @@ -37,9 +35,8 @@ def _real_extract(self, url): 'url': video_url, 'width': int_or_none(viewResolvtionsWidth), 'height': int_or_none(viewResolvtionsHeight), - 'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', - 'Referer': 'https://zoom.us/', - } + 'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', + 'Referer': 'https://zoom.us/'} }) self._sort_formats(formats) @@ -47,4 +44,4 @@ def _real_extract(self, url): 'id': display_id, 'title': topic, 'formats': formats - } \ No newline at end of file + } From abd273e17bb324296a81ea82be398e478ecdfa60 Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Mon, 13 Apr 2020 07:27:56 +0200 Subject: [PATCH 16/49] [zoomus] coding conventions --- youtube_dlc/extractor/zoomus.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py index a0e34801f2..75a1b63757 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoomus.py @@ -4,12 +4,14 @@ from .common import InfoExtractor from ..utils import ( int_or_none, + url_or_none, + parse_filesize ) class ZoomUSIE(InfoExtractor): IE_NAME = 'zoom.us' - _VALID_URL = r'https://(.*).?zoom.us/rec(ording)?/play/(?P.*)' + _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P[^?&=]{64})' _TEST = { 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', @@ -17,31 +19,33 @@ class ZoomUSIE(InfoExtractor): 'md5': '031a5b379f1547a8b29c5c4c837dccf2', 'title': "GAZ Transformational Tuesdays W/ Landon & Stapes", 'id': "SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK", - 'ext': "mp4", + 'ext': "mp4" } } def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_url = self._search_regex(r"viewMp4Url: \'(.*)\'", webpage, 'video url') - topic = self._search_regex(r"topic: \"(.*)\",", webpage, 'video url') - viewResolvtionsWidth = self._search_regex(r"viewResolvtionsWidth: (.*),", webpage, 'res width') - viewResolvtionsHeight = self._search_regex(r"viewResolvtionsHeight: (.*),", webpage, 'res width') + title = self._html_search_regex([r"topic: \"(.*)\",", r"(.*) - Zoom"], webpage, 'title') + viewResolvtionsWidth = self._search_regex(r"viewResolvtionsWidth: (\d*)", webpage, 'res width', fatal=False) + viewResolvtionsHeight = self._search_regex(r"viewResolvtionsHeight: (\d*)", webpage, 'res height', fatal=False) + fileSize = parse_filesize(self._search_regex(r"fileSize: \'(.+)\'", webpage, 'fileSize', fatal=False)) formats = [] formats.append({ - 'url': video_url, + 'url': url_or_none(video_url), 'width': int_or_none(viewResolvtionsWidth), 'height': int_or_none(viewResolvtionsHeight), 'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', - 'Referer': 'https://zoom.us/'} + 'Referer': 'https://zoom.us/'}, + 'ext': "mp4", + 'filesize_approx': int_or_none(fileSize) }) self._sort_formats(formats) return { 'id': display_id, - 'title': topic, + 'title': title, 'formats': formats } From 81acad1279c59edf63ceb3348437521715276210 Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Mon, 20 Apr 2020 16:20:54 +0200 Subject: [PATCH 17/49] [zoomus] Added support for password protected videos --- youtube_dlc/extractor/zoomus.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py index 75a1b63757..eb8b0fd0cc 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoomus.py @@ -3,9 +3,11 @@ from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, url_or_none, - parse_filesize + parse_filesize, + urlencode_postdata ) @@ -26,6 +28,12 @@ class ZoomUSIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + + password_protected = self._search_regex(r']+?id="(password_form)"', webpage, 'password field', fatal=False) + if password_protected is not None: + self._verify_video_password(url, display_id, webpage) + webpage = self._download_webpage(url, display_id) + video_url = self._search_regex(r"viewMp4Url: \'(.*)\'", webpage, 'video url') title = self._html_search_regex([r"topic: \"(.*)\",", r"(.*) - Zoom"], webpage, 'title') viewResolvtionsWidth = self._search_regex(r"viewResolvtionsWidth: (\d*)", webpage, 'res width', fatal=False) @@ -49,3 +57,24 @@ def _real_extract(self, url): 'title': title, 'formats': formats } + + def _verify_video_password(self, url, video_id, webpage): + password = self._downloader.params.get('videopassword') + if password is None: + raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True) + meetId = self._search_regex(r']+?id="meetId" value="([^\"]+)"', webpage, 'meetId') + data = urlencode_postdata({ + 'id': meetId, + 'passwd': password, + 'action': "viewdetailedpage", + 'recaptcha': "" + }) + validation_url = url.split("zoom.us")[0]+"zoom.us/rec/validate_meet_passwd" + validation_response = self._download_json( + validation_url, video_id, + note='Validating Password...', + errnote='Wrong password?', + data=data) + + if validation_response['errorCode'] != 0: + raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, validation_response['errorMessage'])) From aa13f124a5afcca3af3086ab7bcdc74783a95127 Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Tue, 21 Apr 2020 09:48:35 +0200 Subject: [PATCH 18/49] [zoomus] Adjusted referer header, fixed formating for flake8 --- youtube_dlc/extractor/zoomus.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py index eb8b0fd0cc..f61f35da8c 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoomus.py @@ -40,13 +40,15 @@ def _real_extract(self, url): viewResolvtionsHeight = self._search_regex(r"viewResolvtionsHeight: (\d*)", webpage, 'res height', fatal=False) fileSize = parse_filesize(self._search_regex(r"fileSize: \'(.+)\'", webpage, 'fileSize', fatal=False)) + urlprefix = url.split("zoom.us")[0] + "zoom.us/" + formats = [] formats.append({ 'url': url_or_none(video_url), 'width': int_or_none(viewResolvtionsWidth), 'height': int_or_none(viewResolvtionsHeight), 'http_headers': {'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', - 'Referer': 'https://zoom.us/'}, + 'Referer': urlprefix}, 'ext': "mp4", 'filesize_approx': int_or_none(fileSize) }) @@ -69,7 +71,7 @@ def _verify_video_password(self, url, video_id, webpage): 'action': "viewdetailedpage", 'recaptcha': "" }) - validation_url = url.split("zoom.us")[0]+"zoom.us/rec/validate_meet_passwd" + validation_url = url.split("zoom.us")[0] + "zoom.us/rec/validate_meet_passwd" validation_response = self._download_json( validation_url, video_id, note='Validating Password...', From b11a88fc243a078c2addbcf0d1377bd65495bc05 Mon Sep 17 00:00:00 2001 From: Roman Sebastian Karwacik Date: Tue, 2 Jun 2020 13:07:10 +0200 Subject: [PATCH 19/49] [zoomus] Adjusted url regex, now allowing for arbitrary long ids, dont throw warning if password field not found --- youtube_dlc/extractor/zoomus.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoomus.py index f61f35da8c..9aae30d373 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoomus.py @@ -13,7 +13,7 @@ class ZoomUSIE(InfoExtractor): IE_NAME = 'zoom.us' - _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P[^?&=]{64})' + _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P[A-Za-z0-9\-_]+)' _TEST = { 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', @@ -29,7 +29,7 @@ def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - password_protected = self._search_regex(r']+?id="(password_form)"', webpage, 'password field', fatal=False) + password_protected = self._search_regex(r']+?id="(password_form)"', webpage, 'password field', fatal=False, default=None) if password_protected is not None: self._verify_video_password(url, display_id, webpage) webpage = self._download_webpage(url, display_id) From 471115dbeefb899ee036d3e769da1f90070664b6 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Tue, 3 Nov 2020 10:31:31 +0100 Subject: [PATCH 20/49] [skip travis] add option to use pip to use master --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 5a26906aca..83e51f68b4 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,10 @@ # INSTALLATION python -m pip install --upgrade youtube-dlc +If you want to install the current master branch + + python -m pip install git+https://github.com/blackjack4494/yt-dlc + **UNIX** (Linux, macOS, etc.) Using wget: From 15f6397c197af9ad464b2c385e3c8d4192aadb07 Mon Sep 17 00:00:00 2001 From: insaneracist Date: Tue, 3 Nov 2020 07:15:16 -0800 Subject: [PATCH 21/49] [youtube] get mix playlist title from ytInitialData --- youtube_dlc/extractor/youtube.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index d8f0dab1f5..d736daa40c 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -2825,6 +2825,7 @@ def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id ids = [] + yt_initial = None last_id = playlist_id[-11:] for n in itertools.count(1): url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) @@ -2858,6 +2859,9 @@ def _extract_mix(self, playlist_id): or search_title('title')) title = clean_html(title_span) + if not title: + title = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'], compat_str) + return self.playlist_result(url_results, playlist_id, title) def _extract_playlist(self, playlist_id): From be5d6c213cc68ab0ae3764db7c3fd9ed128b3ff3 Mon Sep 17 00:00:00 2001 From: exwm Date: Tue, 3 Nov 2020 20:59:23 -0500 Subject: [PATCH 22/49] [vlive] refactor: delete dead function code --- youtube_dlc/extractor/vlive.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 38d78eda16..a205af921b 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -202,16 +202,6 @@ def _replay(self, video_id, webpage, params, video_params): self._get_common_fields(webpage, params), self._extract_video_info(video_id, long_video_id, key)) - def _download_init_page(self, video_id): - return self._download_webpage( - 'https://www.vlive.tv/video/init/view', - video_id, note='Downloading live webpage', - data=urlencode_postdata({'videoSeq': video_id}), - headers={ - 'Referer': 'https://www.vlive.tv/video/%s' % video_id, - 'Content-Type': 'application/x-www-form-urlencoded' - }) - class VLiveChannelIE(InfoExtractor): IE_NAME = 'vlive:channel' From c434e9f504ed93ae851ff6b6b46051c91b0ec213 Mon Sep 17 00:00:00 2001 From: exwm Date: Tue, 3 Nov 2020 21:05:19 -0500 Subject: [PATCH 23/49] [vlive] fix: missing expected types for try_get --- youtube_dlc/extractor/vlive.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index a205af921b..fe9788d8f7 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -119,13 +119,14 @@ def _real_extract(self, url): PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL) params = self._parse_json(params, working_id, fatal=False) - video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"]) + video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict) if video_params is None: - error_data = try_get(params, lambda x: x["postDetail"]["error"]["data"]) + error_data = try_get(params, lambda x: x["postDetail"]["error"]["data"], dict) product_type = try_get(error_data, [lambda x: x["officialVideo"]["productType"], - lambda x: x["board"]["boardType"]]) + lambda x: x["board"]["boardType"]], + compat_str) if product_type in ('VLIVE_PLUS', 'VLIVE+'): self.raise_login_required('This video is only available for VLIVE+ subscribers') elif 'post' in url: @@ -173,7 +174,7 @@ def _live(self, video_id, webpage, params): play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id, headers={"referer": "https://www.vlive.tv"}) - streams = try_get(play_info, lambda x: x["result"]["streamList"]) or [] + streams = try_get(play_info, lambda x: x["result"]["streamList"], list) or [] formats = [] for stream in streams: From 9c8bc84fd2000a90418aae17d89eb20f2418f54b Mon Sep 17 00:00:00 2001 From: exwm Date: Tue, 3 Nov 2020 21:27:49 -0500 Subject: [PATCH 24/49] [vlive] add: improved video extractor errors --- youtube_dlc/extractor/vlive.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index fe9788d8f7..935560b576 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -122,15 +122,24 @@ def _real_extract(self, url): video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict) if video_params is None: - error_data = try_get(params, lambda x: x["postDetail"]["error"]["data"], dict) + error = try_get(params, lambda x: x["postDetail"]["error"], dict) + error_data = try_get(error, lambda x: x["data"], dict) + error_video = try_get(error_data, lambda x: x["officialVideo"], dict) + error_msg = try_get(error, lambda x: x["message"], compat_str) product_type = try_get(error_data, [lambda x: x["officialVideo"]["productType"], lambda x: x["board"]["boardType"]], compat_str) - if product_type in ('VLIVE_PLUS', 'VLIVE+'): - self.raise_login_required('This video is only available for VLIVE+ subscribers') + + if error_video is not None: + if product_type in ('VLIVE_PLUS', 'VLIVE+'): + self.raise_login_required('This video is only available with V LIVE+.') + elif error_msg is not None: + raise ExtractorError('V LIVE reported the following error: %s' % error_msg) + else: + raise ExtractorError('Failed to extract video parameters.') elif 'post' in url: - raise ExtractorError('Url does not appear to be a video post.') + raise ExtractorError('Url does not appear to be a video post.', expected=True) else: raise ExtractorError('Failed to extract video parameters.') @@ -193,11 +202,12 @@ def _live(self, video_id, webpage, params): return info def _replay(self, video_id, webpage, params, video_params): + long_video_id = video_params["vodId"] + VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, headers={"referer": "https://www.vlive.tv"}) key = key_json["inkey"] - long_video_id = video_params["vodId"] return merge_dicts( self._get_common_fields(webpage, params), From ab36800b1fc7c17ab587bfe8015a0260db635efb Mon Sep 17 00:00:00 2001 From: nixxo Date: Wed, 4 Nov 2020 18:14:02 +0100 Subject: [PATCH 25/49] [la7] fix missing protocol --- youtube_dlc/extractor/la7.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/youtube_dlc/extractor/la7.py b/youtube_dlc/extractor/la7.py index f5d4564faf..74b006fb50 100644 --- a/youtube_dlc/extractor/la7.py +++ b/youtube_dlc/extractor/la7.py @@ -36,6 +36,9 @@ class LA7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + if not url.startswith('http'): + url = '%s//%s' % (self.http_scheme(), url) + webpage = self._download_webpage(url, video_id) player_data = self._search_regex( From 659ddd7f7055baa8742433c2b73f01b3a1e2505f Mon Sep 17 00:00:00 2001 From: insaneracist Date: Wed, 4 Nov 2020 10:06:53 -0800 Subject: [PATCH 26/49] [youtube] fix: Youtube Music playlists --- youtube_dlc/extractor/youtube.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 2e70ad6fa1..d6550a7766 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -2965,9 +2965,12 @@ def _real_extract(self, url): if video: return video + youtube_music_playlist_prefix = 'RDCLAK5uy_' if playlist_id.startswith(('RD', 'UL', 'PU')): - # Mixes require a custom extraction process - return self._extract_mix(playlist_id) + if not playlist_id.startswith(youtube_music_playlist_prefix): + # Mixes require a custom extraction process, + # Youtube Music playlists act like normal playlists (with randomized order) + return self._extract_mix(playlist_id) has_videos, playlist = self._extract_playlist(playlist_id) if has_videos or not video_id: From 7f4f0b21c26b59a1d621e6407ea2f4ed6c1a98be Mon Sep 17 00:00:00 2001 From: insaneracist Date: Wed, 4 Nov 2020 12:00:51 -0800 Subject: [PATCH 27/49] [youtube] added Youtube Music channel info --- youtube_dlc/extractor/youtube.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index d6550a7766..cd4e844a02 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -2631,6 +2631,12 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P\d+))?(?:[^>]+>(?P[^<]+))?)?' _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' + _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_' + _YTM_CHANNEL_INFO = { + 'uploader': 'Youtube Music', + 'uploader_id': 'music', # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ" + 'uploader_url': 'https://www.youtube.com/music' + } _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { @@ -2936,6 +2942,8 @@ def _extract_playlist(self, playlist_id): 'uploader_id': uploader_id, 'uploader_url': uploader_url, }) + if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX): + playlist.update(self._YTM_CHANNEL_INFO) return has_videos, playlist @@ -2965,9 +2973,8 @@ def _real_extract(self, url): if video: return video - youtube_music_playlist_prefix = 'RDCLAK5uy_' if playlist_id.startswith(('RD', 'UL', 'PU')): - if not playlist_id.startswith(youtube_music_playlist_prefix): + if not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX): # Mixes require a custom extraction process, # Youtube Music playlists act like normal playlists (with randomized order) return self._extract_mix(playlist_id) From 366a7a4753944802ed88638decd683f7472de53e Mon Sep 17 00:00:00 2001 From: insaneracist <insaneracist@cyberdude.com> Date: Wed, 4 Nov 2020 12:13:51 -0800 Subject: [PATCH 28/49] [zoom] rename extractor from zoomus --- youtube_dlc/extractor/extractors.py | 2 +- youtube_dlc/extractor/{zoomus.py => zoom.py} | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) rename youtube_dlc/extractor/{zoomus.py => zoom.py} (98%) diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 34a8cecd5c..24c1075987 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -1544,5 +1544,5 @@ ) from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE -from .zoomus import ZoomUSIE +from .zoom import ZoomIE from .zype import ZypeIE diff --git a/youtube_dlc/extractor/zoomus.py b/youtube_dlc/extractor/zoom.py similarity index 98% rename from youtube_dlc/extractor/zoomus.py rename to youtube_dlc/extractor/zoom.py index 9aae30d373..003e1f901d 100644 --- a/youtube_dlc/extractor/zoomus.py +++ b/youtube_dlc/extractor/zoom.py @@ -11,8 +11,8 @@ ) -class ZoomUSIE(InfoExtractor): - IE_NAME = 'zoom.us' +class ZoomIE(InfoExtractor): + IE_NAME = 'zoom' _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P<id>[A-Za-z0-9\-_]+)' _TEST = { From 503d4a44f65146a63bf1bd5c04ac510a04fe0d33 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Thu, 5 Nov 2020 01:47:52 +0530 Subject: [PATCH 29/49] Don't try to embed/convert json subtitles generated by youtube livechat --- youtube_dlc/postprocessor/ffmpeg.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/postprocessor/ffmpeg.py b/youtube_dlc/postprocessor/ffmpeg.py index 5e85f4eebd..c38db31430 100644 --- a/youtube_dlc/postprocessor/ffmpeg.py +++ b/youtube_dlc/postprocessor/ffmpeg.py @@ -412,7 +412,9 @@ def run(self, information): for lang, sub_info in subtitles.items(): sub_ext = sub_info['ext'] - if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': + if sub_ext == 'json': + self._downloader.to_screen('[ffmpeg] JSON subtitles cannot be embedded') + elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt': sub_langs.append(lang) sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext)) else: @@ -643,13 +645,18 @@ def run(self, info): self._downloader.to_screen( '[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext) continue + elif ext == 'json': + self._downloader.to_screen( + '[ffmpeg] You have requested to convert json subtitles into another format, ' + 'which is currently not possible') + continue old_file = subtitles_filename(filename, lang, ext, info.get('ext')) sub_filenames.append(old_file) new_file = subtitles_filename(filename, lang, new_ext, info.get('ext')) if ext in ('dfxp', 'ttml', 'tt'): self._downloader.report_warning( - 'You have requested to convert dfxp (TTML) subtitles into another format, ' + '[ffmpeg] You have requested to convert dfxp (TTML) subtitles into another format, ' 'which results in style information loss') dfxp_file = old_file From 8abd647c59c9eb8f0fefd2b329e62b2b32bac6ea Mon Sep 17 00:00:00 2001 From: nixxo <c.nixxo@gmail.com> Date: Thu, 5 Nov 2020 20:52:28 +0100 Subject: [PATCH 30/49] [mailru] removed escaped braces, use urljoin, added tests --- youtube_dlc/extractor/mailru.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/mailru.py b/youtube_dlc/extractor/mailru.py index 6fdf70aa68..5bfe40649d 100644 --- a/youtube_dlc/extractor/mailru.py +++ b/youtube_dlc/extractor/mailru.py @@ -12,6 +12,7 @@ parse_duration, remove_end, try_get, + urljoin, ) @@ -93,6 +94,14 @@ class MailRuIE(InfoExtractor): { 'url': 'https://my.mail.ru//list//sinyutin10/video/_myvideo/4.html', 'only_matching': True, + }, + { + 'url': 'https://my.mail.ru/mail/cloud-strife/video/embed/Games/2009', + 'only_matching': True, + }, + { + 'url': 'https://videoapi.my.mail.ru/videos/embed/mail/cloud-strife/Games/2009.html', + 'only_matching': True, } ] @@ -110,7 +119,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) page_config = self._parse_json(self._search_regex([ r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>', - r'(?s)"video":\s*(\{.+?\}),'], + r'(?s)"video":\s*({.+?}),'], webpage, 'page config', default='{}'), video_id, fatal=False) if page_config: meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') or page_config.get('metadataUrl') @@ -121,7 +130,7 @@ def _real_extract(self, url): # fix meta_url if missing the host address if re.match(r'^\/\+\/', meta_url): - meta_url = 'https://my.mail.ru' + meta_url + meta_url = urljoin('https://my.mail.ru', meta_url) if meta_url: video_data = self._download_json( From 5db4014b2367317fc6875aeb8fddc374b5225074 Mon Sep 17 00:00:00 2001 From: Unknown <blackjack4494@web.de> Date: Sat, 7 Nov 2020 15:05:05 +0100 Subject: [PATCH 31/49] [skip travis] readme and pypi update --- README.md | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 83e51f68b4..f884ad067e 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,14 @@ [![PyPi](https://img.shields.io/pypi/v/youtube-dlc.svg)](https://pypi.org/project/youtube-dlc) [![Gitter chat](https://img.shields.io/gitter/room/youtube-dlc/community)](https://gitter.im/youtube-dlc) -[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/blackjack4494/youtube-dlc/blob/master/LICENSE) +[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/blackjack4494/yt-dlc/blob/master/LICENSE) youtube-dlc - download videos from youtube.com or other video platforms. youtube-dlc is a fork of youtube-dl with the intention of getting features tested by the community merged in the tool faster, since youtube-dl's development seems to be slowing down. (https://web.archive.org/web/20201014194602/https://github.com/ytdl-org/youtube-dl/issues/26462) - [INSTALLATION](#installation) +- [UPDATE](#update) - [DESCRIPTION](#description) - [OPTIONS](#options) - [Network Options:](#network-options) diff --git a/setup.py b/setup.py index a10ef0a771..6908f24045 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ def run(self): description=DESCRIPTION, long_description=LONG_DESCRIPTION, # long_description_content_type="text/markdown", - url="https://github.com/blackjack4494/youtube-dlc", + url="https://github.com/blackjack4494/yt-dlc", packages=find_packages(exclude=("youtube_dl","test",)), #packages=[ # 'youtube_dlc', From 5943bb6214eca0a4aebb223d5a5800e3a024ae35 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel <github@tom-oliver.eu> Date: Sat, 7 Nov 2020 16:00:01 +0100 Subject: [PATCH 32/49] [skip travis] update workflow - sha file --- .github/workflows/build.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index f5d94dc490..cc344f6014 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -109,7 +109,7 @@ jobs: runs-on: windows-latest - needs: build_unix + needs: [build_unix, build_windows] steps: - uses: actions/checkout@v2 @@ -146,10 +146,10 @@ jobs: SHA2_UNIX: ${{ needs.build_unix.outputs.sha2_unix }} YTDLC_VERSION: ${{ needs.build_unix.outputs.ytdlc_version }} run: | - echo "version:$YTDLC_VERSION" >> SHA2-256SUMS - echo "youtube-dlc.exe:$SHA2_WINDOWS" >> SHA2-256SUMS - echo "youtube-dlc32.exe:$SHA2_WINDOWS32" >> SHA2-256SUMS - echo "youtube-dlc:$SHA2_UNIX" >> SHA2-256SUMS + echo "version:${env:YTDLC_VERSION}" >> SHA2-256SUMS + echo "youtube-dlc.exe:${env:SHA2_WINDOWS}" >> SHA2-256SUMS + echo "youtube-dlc_x86.exe:${env:SHA2_WINDOWS32}" >> SHA2-256SUMS + echo "youtube-dlc:${env:SHA2_UNIX}" >> SHA2-256SUMS - name: Upload 256SUMS file id: upload-sums From b860e4cc2f53c7858054f73928f51188ea6b49b8 Mon Sep 17 00:00:00 2001 From: Nicolas SAPA <nico@ByMe.at> Date: Sun, 8 Nov 2020 08:36:26 +0100 Subject: [PATCH 33/49] [common] Make sure self.params.get('sleep_interval_subtitles') is int This can happen if another software is using yt-dlc'API (ie: tubeup). The stack trace would be: $ tubeup 'https://youtube.com/watch?v=JyE9OF03cao' [debug] Encodings: locale UTF-8, fs utf-8, out UTF-8, pref UTF-8 [debug] youtube-dlc version 2020.10.25 [debug] Python version 3.7.3 (CPython) - Linux-5.8.0-0.bpo.2-amd64-x86_64-with-debian-10.6 [debug] exe versions: ffmpeg 3.3.9, ffprobe 3.3.9 [debug] Proxy map: {} There are no annotations to write. ERROR: '>' not supported between instances of 'NoneType' and 'int' Traceback (most recent call last): File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 846, in extract_info return self.process_ie_result(ie_result, download, extra_info) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 901, in process_ie_result return self.process_video_result(ie_result, download=download) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1696, in process_video_result self.process_info(new_info) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1894, in process_info dl(sub_filename, sub_info, subtitle=True) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1866, in dl return fd.download(name, info, subtitle) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/downloader/common.py", line 367, in download if self.params.get('sleep_interval_subtitles') > 0: TypeError: '>' not supported between instances of 'NoneType' and 'int' --- youtube_dlc/downloader/common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/downloader/common.py b/youtube_dlc/downloader/common.py index c65500d613..7d303be1cf 100644 --- a/youtube_dlc/downloader/common.py +++ b/youtube_dlc/downloader/common.py @@ -364,8 +364,10 @@ def download(self, filename, info_dict, subtitle=False): else '%.2f' % sleep_interval)) time.sleep(sleep_interval) else: - if self.params.get('sleep_interval_subtitles') > 0: + sleep_interval_sub = 0 + if type(self.params.get('sleep_interval_subtitles')) is int: sleep_interval_sub = self.params.get('sleep_interval_subtitles') + if sleep_interval_sub > 0: self.to_screen( '[download] Sleeping %s seconds...' % ( sleep_interval_sub)) From 8263104fe4f7aed96a1cc92be6b58cc219de876e Mon Sep 17 00:00:00 2001 From: Nicolas SAPA <nico@ByMe.at> Date: Sun, 8 Nov 2020 08:49:03 +0100 Subject: [PATCH 34/49] [youtube] Fix 'liveChatReplayContinuationData' missing 'continuation' key live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] can not exist. So catch the KeyError. Traceback: $ tubeup 'https://youtube.com/watch?v=JyE9OF03cao' [debug] Encodings: locale UTF-8, fs utf-8, out UTF-8, pref UTF-8 [debug] youtube-dlc version 2020.10.25 [debug] Python version 3.7.3 (CPython) - Linux-5.8.0-0.bpo.2-amd64-x86_64-with-debian-10.6 [debug] exe versions: ffmpeg 3.3.9, ffprobe 3.3.9 [debug] Proxy map: {} There are no annotations to write. [download] 452.59KiB at 615.35KiB/s (00:01)ERROR: 'liveChatReplayContinuationData' Traceback (most recent call last): File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 846, in extract_info return self.process_ie_result(ie_result, download, extra_info) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 901, in process_ie_result return self.process_video_result(ie_result, download=download) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1696, in process_video_result self.process_info(new_info) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1894, in process_info dl(sub_filename, sub_info, subtitle=True) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1866, in dl return fd.download(name, info, subtitle) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/downloader/common.py", line 375, in download return self.real_download(filename, info_dict) File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/downloader/youtube_live_chat.py", line 85, in real_download continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] KeyError: 'liveChatReplayContinuationData' --- youtube_dlc/downloader/youtube_live_chat.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dlc/downloader/youtube_live_chat.py b/youtube_dlc/downloader/youtube_live_chat.py index 4932dd9c52..b333afa5bd 100644 --- a/youtube_dlc/downloader/youtube_live_chat.py +++ b/youtube_dlc/downloader/youtube_live_chat.py @@ -82,7 +82,10 @@ def parse_yt_initial_data(data): offset = int(replay_chat_item_action['videoOffsetTimeMsec']) processed_fragment.extend( json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n') - continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] + try: + continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] + except KeyError: + continuation_id = None self._append_fragment(ctx, processed_fragment) From 876f1c17fff194cbed3595bb2a8497ea9e479bf7 Mon Sep 17 00:00:00 2001 From: Ali Sherief <alihsherief@linuxmail.org> Date: Mon, 9 Nov 2020 16:06:48 +0000 Subject: [PATCH 35/49] Fix #93 YoutubePlaylistsIE --- youtube_dlc/extractor/youtube.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 3ec2581dc8..35ac67b492 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -300,11 +300,12 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): # Extract entries from page with "Load more" button def _entries(self, page, playlist_id): more_widget_html = content_html = page + mobj_reg = r'(?:(?:data-uix-load-more-href="[^"]+?;continuation=)|(?:"continuation":"))(?P<more>[^"]+)"' for page_num in itertools.count(1): for entry in self._process_page(content_html): yield entry - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) + mobj = re.search(mobj_reg, more_widget_html) if not mobj: break @@ -315,7 +316,7 @@ def _entries(self, page, playlist_id): # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, + 'https://www.youtube.com/browse_ajax?ctoken=%s' % mobj.group('more'), playlist_id, 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, @@ -372,7 +373,7 @@ def extract_videos_from_page(self, page): class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): for playlist_id in orderedSet(re.findall( - r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', + r'"/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') From 142f2c8e99e61054d3354bd915a9e46cbd80c8ea Mon Sep 17 00:00:00 2001 From: Robin Dunn <> Date: Mon, 9 Nov 2020 15:24:42 -0800 Subject: [PATCH 36/49] fall-back to the old way to fetch subtitles, if needed --- youtube_dlc/extractor/viki.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py index 0f188f84d1..6bddf8be9d 100644 --- a/youtube_dlc/extractor/viki.py +++ b/youtube_dlc/extractor/viki.py @@ -308,17 +308,26 @@ def _real_extract(self, url): 'url': thumbnail.get('url'), }) - new_video = self._download_json( - 'https://www.viki.com/api/videos/%s' % video_id, video_id, - 'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404]) - subtitles = {} - for sub in new_video.get('streamSubtitles').get('dash'): - subtitles[sub.get('srclang')] = [{ - 'ext': 'vtt', - 'url': sub.get('src'), - 'completion': sub.get('percentage'), - }] + try: + # New way to fetch subtitles + new_video = self._download_json( + 'https://www.viki.com/api/videos/%s' % video_id, video_id, + 'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404]) + for sub in new_video.get('streamSubtitles').get('dash'): + subtitles[sub.get('srclang')] = [{ + 'ext': 'vtt', + 'url': sub.get('src'), + 'completion': sub.get('percentage'), + }] + except AttributeError: + # fall-back to the old way if there isn't a streamSubtitles attribute + for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): + subtitles[subtitle_lang] = [{ + 'ext': subtitles_format, + 'url': self._prepare_call( + 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)), + } for subtitles_format in ('srt', 'vtt')] result = { 'id': video_id, From da8fb75df5aa3a6bdda2afbe7bec7da905f0618a Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel <github@tom-oliver.eu> Date: Tue, 10 Nov 2020 01:19:33 +0100 Subject: [PATCH 37/49] [skip travis] adjust python versions --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 74b50ecca0..4920a30b80 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -82,7 +82,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.x' + python-version: '3.8' - name: Install Requirements run: pip install pyinstaller - name: Bump version @@ -116,7 +116,7 @@ jobs: - name: Set up Python 3.5.4 32-Bit uses: actions/setup-python@v2 with: - python-version: '3.5.4' + python-version: '3.4.4' architecture: 'x86' - name: Install Requirements for 32 Bit run: pip install pyinstaller==3.5 From 9833e7a015ca788a4f881c8ee945967b5f3d71bc Mon Sep 17 00:00:00 2001 From: Luc Ritchie <luc.ritchie@gmail.com> Date: Tue, 10 Nov 2020 03:38:26 -0500 Subject: [PATCH 38/49] fix: youtube: Polymer UI and JSON endpoints for playlists We already had a few copies of Polymer-style pagination handling logic for certain circumstances, but now we're forced into using it for all playlists since we can no longer disable Polymer. Refactor the logic to move it to the parent class for all entry lists (including e.g. search results, feeds, and list of playlists), and generify a bit to cover the child classes' use cases. --- youtube_dlc/extractor/youtube.py | 280 ++++++++++++++----------------- 1 file changed, 126 insertions(+), 154 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 3ec2581dc8..273d823c22 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -36,6 +36,7 @@ get_element_by_attribute, get_element_by_id, int_or_none, + js_to_json, mimetype2ext, orderedSet, parse_codecs, @@ -70,6 +71,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): _LOGIN_REQUIRED = False _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' + _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' + _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)" _YOUTUBE_CLIENT_HEADERS = { 'x-youtube-client-name': '1', @@ -274,7 +277,6 @@ def warn(message): def _download_webpage_handle(self, *args, **kwargs): query = kwargs.get('query', {}).copy() - query['disable_polymer'] = 'true' kwargs['query'] = query return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) @@ -297,15 +299,60 @@ def _real_initialize(self): class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): - # Extract entries from page with "Load more" button + def _find_entries_in_json(self, extracted): + entries = [] + c = {} + + def _real_find(obj): + if obj is None or isinstance(obj, str): + return + + if type(obj) is list: + for elem in obj: + _real_find(elem) + + if type(obj) is dict: + if self._is_entry(obj): + entries.append(obj) + return + + if 'continuationCommand' in obj: + c['continuation'] = obj + return + + for _, o in obj.items(): + _real_find(o) + + _real_find(extracted) + + return entries, try_get(c, lambda x: x["continuation"]) + def _entries(self, page, playlist_id): - more_widget_html = content_html = page + seen = [] + + yt_conf = {} + for m in re.finditer(self._YTCFG_DATA_RE, page): + parsed = self._parse_json(m.group(1), playlist_id, + transform_source=js_to_json, fatal=False) + if parsed: + yt_conf.update(parsed) + + data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None) + for page_num in itertools.count(1): - for entry in self._process_page(content_html): + entries, continuation = self._find_entries_in_json(data_json) + processed = self._process_entries(entries, seen) + + if not processed: + break + for entry in processed: yield entry - mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) - if not mobj: + if not continuation or not yt_conf: + break + continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token']) + continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl']) + if not continuation_token or not continuation_url: break count = 0 @@ -314,12 +361,22 @@ def _entries(self, page, playlist_id): try: # Downloading page may result in intermittent 5xx HTTP error # that is usually worked around with a retry - more = self._download_json( - 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, - 'Downloading page #%s%s' - % (page_num, ' (retry #%d)' % count if count else ''), + data_json = self._download_json( + 'https://www.youtube.com%s' % continuation_url, + playlist_id, + 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, - headers=self._YOUTUBE_CLIENT_HEADERS) + query={ + 'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY']) + }, + data=bytes(json.dumps({ + 'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']), + 'continuation': continuation_token + }), encoding='utf-8'), + headers={ + 'Content-Type': 'application/json' + } + ) break except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): @@ -328,31 +385,30 @@ def _entries(self, page, playlist_id): continue raise - content_html = more['content_html'] - if not content_html.strip(): - # Some webpages show a "Load more" button but they don't - # have more videos - break - more_widget_html = more['load_more_widget_html'] + def _extract_title(self, renderer): + title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str) + if title: + return title + return try_get(renderer, lambda x: x['title']['simpleText'], compat_str) class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for video_id, video_title in self.extract_videos_from_page(content): - yield self.url_result(video_id, 'Youtube', video_id, video_title) + def _is_entry(self, obj): + return 'videoId' in obj - def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): - for mobj in re.finditer(video_re, page): - # The link with index 0 is not the first video of the playlist (not sure if still actual) - if 'index' in mobj.groupdict() and mobj.group('id') == '0': + def _process_entries(self, entries, seen): + ids_in_page = [] + titles_in_page = [] + for renderer in entries: + video_id = try_get(renderer, lambda x: x['videoId']) + video_title = self._extract_title(renderer) + + if video_id is None or video_title is None: + # we do not have a videoRenderer or title extraction broke continue - video_id = mobj.group('id') - video_title = unescapeHTML( - mobj.group('title')) if 'title' in mobj.groupdict() else None - if video_title: - video_title = video_title.strip() - if video_title == '► Play all': - video_title = None + + video_title = video_title.strip() + try: idx = ids_in_page.index(video_id) if video_title and not titles_in_page[idx]: @@ -361,19 +417,16 @@ def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_p ids_in_page.append(video_id) titles_in_page.append(video_title) - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl( - self._VIDEO_RE, page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) + for video_id, video_title in zip(ids_in_page, titles_in_page): + yield self.url_result(video_id, 'Youtube', video_id, video_title) class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): - def _process_page(self, content): - for playlist_id in orderedSet(re.findall( - r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', - content)): + def _is_entry(self, obj): + return 'playlistId' in obj + + def _process_entries(self, entries, seen): + for playlist_id in orderedSet(try_get(r, lambda x: x['playlistId']) for r in entries): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -3240,11 +3293,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): }] -class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' - - -class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results @@ -3341,11 +3390,10 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_PARAMS = 'CAI%3D' -class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): +class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -3357,28 +3405,14 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] - def _find_videos_in_json(self, extracted): - videos = [] + def _process_json_dict(self, obj, videos, c): + if "videoId" in obj: + videos.append(obj) + return - def _real_find(obj): - if obj is None or isinstance(obj, str): - return - - if type(obj) is list: - for elem in obj: - _real_find(elem) - - if type(obj) is dict: - if "videoId" in obj: - videos.append(obj) - return - - for _, o in obj.items(): - _real_find(o) - - _real_find(extracted) - - return videos + if "nextContinuationData" in obj: + c["continuation"] = obj["nextContinuationData"] + return def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) @@ -3413,7 +3447,8 @@ def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) + data_json = self._process_initial_data(webpage) + return self.playlist_result(self._process_data(data_json), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): @@ -3435,14 +3470,12 @@ def _real_extract(self, url): 'https://www.youtube.com/show/%s/playlists' % playlist_id) -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): +class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor): """ Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' - _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3451,96 +3484,35 @@ def IE_NAME(self): def _real_initialize(self): self._login() - def _find_videos_in_json(self, extracted): - videos = [] - c = {} + def _process_entries(self, entries, seen): + new_info = [] + for v in entries: + v_id = try_get(v, lambda x: x['videoId']) + if not v_id: + continue - def _real_find(obj): - if obj is None or isinstance(obj, str): - return + have_video = False + for old in seen: + if old['videoId'] == v_id: + have_video = True + break - if type(obj) is list: - for elem in obj: - _real_find(elem) + if not have_video: + new_info.append(v) - if type(obj) is dict: - if "videoId" in obj: - videos.append(obj) - return + if not new_info: + return - if "nextContinuationData" in obj: - c["continuation"] = obj["nextContinuationData"] - return - - for _, o in obj.items(): - _real_find(o) - - _real_find(extracted) - - return videos, try_get(c, lambda x: x["continuation"]) - - def _entries(self, page): - info = [] - - yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False) - - search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) - - for page_num in itertools.count(1): - video_info, continuation = self._find_videos_in_json(search_response) - - new_info = [] - - for v in video_info: - v_id = try_get(v, lambda x: x['videoId']) - if not v_id: - continue - - have_video = False - for old in info: - if old['videoId'] == v_id: - have_video = True - break - - if not have_video: - new_info.append(v) - - if not new_info: - break - - info.extend(new_info) - - for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText'])) - - if not continuation or not yt_conf: - break - - search_response = self._download_json( - 'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - query={ - "ctoken": try_get(continuation, lambda x: x["continuation"]), - "continuation": try_get(continuation, lambda x: x["continuation"]), - "itct": try_get(continuation, lambda x: x["clickTrackingParams"]) - }, - headers={ - "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]), - "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]), - "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]), - "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), - "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), - "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), - "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), - }) + seen.extend(new_info) + for video in new_info: + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video)) def _real_extract(self, url): page = self._download_webpage( 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) + return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE), + playlist_title=self._PLAYLIST_TITLE) class YoutubeWatchLaterIE(YoutubePlaylistIE): From 8f109ad4ad6bc734f817ccf3daefb9ed603d7480 Mon Sep 17 00:00:00 2001 From: Roman Karwacik <roman.karwacik@rwth-aachen.de> Date: Tue, 10 Nov 2020 10:39:57 +0100 Subject: [PATCH 39/49] [zoom] Fix url parsing for url's containing /share/ and dots --- youtube_dlc/extractor/zoom.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/zoom.py b/youtube_dlc/extractor/zoom.py index 003e1f901d..038a902977 100644 --- a/youtube_dlc/extractor/zoom.py +++ b/youtube_dlc/extractor/zoom.py @@ -13,7 +13,7 @@ class ZoomIE(InfoExtractor): IE_NAME = 'zoom' - _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P<id>[A-Za-z0-9\-_]+)' + _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/(play|share)/(?P<id>[A-Za-z0-9\-_.]+)' _TEST = { 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', From 002ea8fe172c0bf234fd15d3775a527706843fc3 Mon Sep 17 00:00:00 2001 From: pukkandan <pukkandan@gmail.com> Date: Tue, 27 Oct 2020 16:48:23 +0530 Subject: [PATCH 40/49] Fix external downloader when there is no http_header --- youtube_dlc/downloader/external.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/youtube_dlc/downloader/external.py b/youtube_dlc/downloader/external.py index c31f8910ad..d2f8f271d3 100644 --- a/youtube_dlc/downloader/external.py +++ b/youtube_dlc/downloader/external.py @@ -115,8 +115,10 @@ class CurlFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') cmd += self._valueless_option('--silent', 'noprogress') cmd += self._valueless_option('--verbose', 'verbose') @@ -150,8 +152,9 @@ class AxelFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-o', tmpfilename] - for key, val in info_dict['http_headers'].items(): - cmd += ['-H', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['-H', '%s: %s' % (key, val)] cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -162,8 +165,9 @@ class WgetFD(ExternalFD): def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] cmd += self._option('--limit-rate', 'ratelimit') retry = self._option('--tries', 'retries') if len(retry) == 2: @@ -189,8 +193,9 @@ def _make_cmd(self, tmpfilename, info_dict): if dn: cmd += ['--dir', dn] cmd += ['--out', os.path.basename(tmpfilename)] - for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['--header', '%s: %s' % (key, val)] cmd += self._option('--interface', 'source_address') cmd += self._option('--all-proxy', 'proxy') cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') @@ -206,8 +211,10 @@ def available(cls): def _make_cmd(self, tmpfilename, info_dict): cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] - for key, val in info_dict['http_headers'].items(): - cmd += ['%s:%s' % (key, val)] + + if info_dict.get('http_headers') is not None: + for key, val in info_dict['http_headers'].items(): + cmd += ['%s:%s' % (key, val)] return cmd @@ -253,7 +260,7 @@ def _call_downloader(self, tmpfilename, info_dict): # if end_time: # args += ['-t', compat_str(end_time - start_time)] - if info_dict['http_headers'] and re.match(r'^https?://', url): + if info_dict.get('http_headers') is not None and re.match(r'^https?://', url): # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. headers = handle_youtubedl_headers(info_dict['http_headers']) From d7aec208f2a2ef883c7ffb14c0c4ceb4c9c9ddfa Mon Sep 17 00:00:00 2001 From: rigstot <rigstot@users.noreply.github.com> Date: Sun, 19 Jul 2020 15:07:29 +0200 Subject: [PATCH 41/49] implement ThisVid extractor deobfuscates the video URL using a reverse engineered version of KVS player's algorithm. This was tested against version 4.0.4, 5.0.1, 5.1.1.4 and 5.2.0.4 of the player and a warning will be issued if the major version changes. --- youtube_dlc/extractor/extractors.py | 1 + youtube_dlc/extractor/thisvid.py | 97 +++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) create mode 100644 youtube_dlc/extractor/thisvid.py diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 666134d868..ee404f78d0 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -1175,6 +1175,7 @@ from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE +from .thisvid import ThisVidIE from .threeqsdn import ThreeQSDNIE from .tiktok import TikTokIE from .tinypic import TinyPicIE diff --git a/youtube_dlc/extractor/thisvid.py b/youtube_dlc/extractor/thisvid.py new file mode 100644 index 0000000000..f507e1b067 --- /dev/null +++ b/youtube_dlc/extractor/thisvid.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor + + +class ThisVidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+/?)' + _TESTS = [{ + 'url': 'https://thisvid.com/videos/french-boy-pantsed/', + 'md5': '3397979512c682f6b85b3b04989df224', + 'info_dict': { + 'id': '2400174', + 'ext': 'mp4', + 'title': 'French Boy Pantsed', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', + 'age_limit': 18, + } + }, { + 'url': 'https://thisvid.com/embed/2400174/', + 'md5': '3397979512c682f6b85b3b04989df224', + 'info_dict': { + 'id': '2400174', + 'ext': 'mp4', + 'title': 'French Boy Pantsed', + 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg', + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + main_id = self._match_id(url) + webpage = self._download_webpage(url, main_id) + + # URL decryptor was reversed from version 4.0.4, later verified working with 5.2.0 and may change in the future. + kvs_version = self._html_search_regex(r'<script [^>]+?src="https://thisvid\.com/player/kt_player\.js\?v=(\d+(\.\d+)+)">', webpage, 'kvs_version', fatal=False) + if not kvs_version.startswith("5."): + self.report_warning("Major version change (" + kvs_version + ") in player engine--Download may fail.") + + title = self._html_search_regex(r'<title>(?:Video: )?(.+?)(?: - (?:\w+ porn at )?ThisVid(?:.com| tube))?', webpage, 'title') + # video_id, video_url and license_code from the 'flashvars' JSON object: + video_id = self._html_search_regex(r"video_id: '([0-9]+)',", webpage, 'video_id') + video_url = self._html_search_regex(r"video_url: '(function/0/.+?)',", webpage, 'video_url') + license_code = self._html_search_regex(r"license_code: '([0-9$]{16})',", webpage, 'license_code') + thumbnail = self._html_search_regex(r"preview_url: '((?:https?:)?//media.thisvid.com/.+?.jpg)',", webpage, 'thumbnail', fatal=False) + if thumbnail.startswith("//"): + thumbnail = "https:" + thumbnail + if (re.match(self._VALID_URL, url).group('type') == "videos"): + display_id = main_id + else: + display_id = self._search_regex(r'', webpage, 'display_id', fatal=False), + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'url': getrealurl(video_url, license_code), + 'thumbnail': thumbnail, + 'age_limit': 18, + } + + +def getrealurl(video_url, license_code): + urlparts = video_url.split('/')[2:] + license = getlicensetoken(license_code) + newmagic = urlparts[5][:32] + + for o in range(len(newmagic) - 1, -1, -1): + new = "" + l = (o + sum([int(n) for n in license[o:]])) % 32 + + for i in range(0, len(newmagic)): + if i == o: + new += newmagic[l] + elif i == l: + new += newmagic[o] + else: + new += newmagic[i] + newmagic = new + + urlparts[5] = newmagic + urlparts[5][32:] + return "/".join(urlparts) + + +def getlicensetoken(license): + modlicense = license.replace("$", "").replace("0", "1") + center = int(len(modlicense) / 2) + fronthalf = int(modlicense[:center + 1]) + backhalf = int(modlicense[center:]) + + modlicense = str(4 * abs(fronthalf - backhalf)) + retval = "" + for o in range(0, center + 1): + for i in range(1, 5): + retval += str((int(license[o + i]) + int(modlicense[o])) % 10) + return retval From 0f8566e90bee77775be133d551045698a84a2bdd Mon Sep 17 00:00:00 2001 From: Unknown Date: Tue, 10 Nov 2020 23:20:52 +0100 Subject: [PATCH 42/49] manually set limit for youtubesearchurl --- youtube_dlc/extractor/youtube.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 273d823c22..0dbb3531c1 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -327,7 +327,7 @@ def _real_find(obj): return entries, try_get(c, lambda x: x["continuation"]) - def _entries(self, page, playlist_id): + def _entries(self, page, playlist_id, n=1): seen = [] yt_conf = {} @@ -339,7 +339,8 @@ def _entries(self, page, playlist_id): data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None) - for page_num in itertools.count(1): + # for page_num in itertools.count(1): + for page_num in range(n): entries, continuation = self._find_entries_in_json(data_json) processed = self._process_entries(entries, seen) @@ -3447,8 +3448,8 @@ def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) - data_json = self._process_initial_data(webpage) - return self.playlist_result(self._process_data(data_json), playlist_title=query) + # data_json = self._process_initial_data(webpage) + return self.playlist_result(self._entries(webpage, query, n=5), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): From 73ac85678588b1c2997a94c0069ac0a9309adf19 Mon Sep 17 00:00:00 2001 From: Luc Ritchie Date: Tue, 10 Nov 2020 17:47:40 -0500 Subject: [PATCH 43/49] [youtube] max_pages=5 for search, unlimited for everything else Also drop a few leftover methods in search that are no longer used. --- youtube_dlc/extractor/youtube.py | 39 ++++---------------------------- 1 file changed, 4 insertions(+), 35 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index d8d12a7210..2fea11070a 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -328,7 +328,7 @@ def _real_find(obj): return entries, try_get(c, lambda x: x["continuation"]) - def _entries(self, page, playlist_id, n=1): + def _entries(self, page, playlist_id, max_pages=None): seen = [] yt_conf = {} @@ -340,8 +340,7 @@ def _entries(self, page, playlist_id, n=1): data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None) - # for page_num in itertools.count(1): - for page_num in range(n): + for page_num in range(1, max_pages + 1) if max_pages is not None else itertools.count(1): entries, continuation = self._find_entries_in_json(data_json) processed = self._process_entries(entries, seen) @@ -366,7 +365,7 @@ def _entries(self, page, playlist_id, n=1): data_json = self._download_json( 'https://www.youtube.com%s' % continuation_url, playlist_id, - 'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), + 'Downloading continuation page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), transform_source=uppercase_escape, query={ @@ -3418,41 +3417,11 @@ def _process_json_dict(self, obj, videos, c): c["continuation"] = obj["nextContinuationData"] return - def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): - search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) - - result_items = self._find_videos_in_json(search_response) - - for renderer in result_items: - video_id = try_get(renderer, lambda x: x['videoId']) - video_title = try_get(renderer, lambda x: x['title']['runs'][0]['text']) or try_get(renderer, lambda x: x['title']['simpleText']) - - if video_id is None or video_title is None: - # we do not have a videoRenderer or title extraction broke - continue - - video_title = video_title.strip() - - try: - idx = ids_in_page.index(video_id) - if video_title and not titles_in_page[idx]: - titles_in_page[idx] = video_title - except ValueError: - ids_in_page.append(video_id) - titles_in_page.append(video_title) - - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page) - return zip(ids_in_page, titles_in_page) - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) - # data_json = self._process_initial_data(webpage) - return self.playlist_result(self._entries(webpage, query, n=5), playlist_title=query) + return self.playlist_result(self._entries(webpage, query, max_pages=0), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): From 104bfdd24de9dd5f636887afd8b263a4c53673a7 Mon Sep 17 00:00:00 2001 From: Unknown Date: Wed, 11 Nov 2020 00:00:27 +0100 Subject: [PATCH 44/49] ytsearchurl 5 pages for around 100 results --- youtube_dlc/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 2fea11070a..d5d25859d1 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -3421,7 +3421,7 @@ def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) - return self.playlist_result(self._entries(webpage, query, max_pages=0), playlist_title=query) + return self.playlist_result(self._entries(webpage, query, max_pages=5), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): From b28e751688a71f37ef6e468faf940bccb311afa9 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Wed, 11 Nov 2020 00:40:43 +0100 Subject: [PATCH 45/49] [skip travis] --- youtube_dlc/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/version.py b/youtube_dlc/version.py index 440d8e4882..3c68ae5eb3 100644 --- a/youtube_dlc/version.py +++ b/youtube_dlc/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.10.25' +__version__ = '2020.11.11-1' From 6bd79800c3c5d3a91561ee34a87dbaa9e8319ae9 Mon Sep 17 00:00:00 2001 From: Unknown Date: Wed, 11 Nov 2020 15:05:18 +0100 Subject: [PATCH 46/49] [youtube] python2 fix #168 proposed fix by awei78 --- youtube_dlc/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index d5d25859d1..629a82c974 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -371,10 +371,10 @@ def _entries(self, page, playlist_id, max_pages=None): query={ 'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY']) }, - data=bytes(json.dumps({ + data= str(json.dumps({ 'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']), 'continuation': continuation_token - }), encoding='utf-8'), + })).encode(encoding='UTF-8',errors='strict'), headers={ 'Content-Type': 'application/json' } From c297a6c6619989f15b41935e49addff1d27e4e41 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Wed, 11 Nov 2020 15:08:12 +0100 Subject: [PATCH 47/49] [skip travis] --- youtube_dlc/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dlc/version.py b/youtube_dlc/version.py index 3c68ae5eb3..201a981cf9 100644 --- a/youtube_dlc/version.py +++ b/youtube_dlc/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2020.11.11-1' +__version__ = '2020.11.11-2' From 5e6cdcecdd1ac74592f27766ef38a3ae059d4ae7 Mon Sep 17 00:00:00 2001 From: Unknown Date: Wed, 11 Nov 2020 15:15:24 +0100 Subject: [PATCH 48/49] flake8 yt py2 fix --- youtube_dlc/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 629a82c974..97cc793f9a 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -371,10 +371,10 @@ def _entries(self, page, playlist_id, max_pages=None): query={ 'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY']) }, - data= str(json.dumps({ + data=str(json.dumps({ 'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']), 'continuation': continuation_token - })).encode(encoding='UTF-8',errors='strict'), + })).encode(encoding='UTF-8', errors='strict'), headers={ 'Content-Type': 'application/json' } From d052b9a112fb7ae749a829dceba6e3289663a303 Mon Sep 17 00:00:00 2001 From: Tom-Oliver Heidel Date: Wed, 11 Nov 2020 15:39:00 +0100 Subject: [PATCH 49/49] [skip travis] typo --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4920a30b80..dd6a95256f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -113,7 +113,7 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python 3.5.4 32-Bit + - name: Set up Python 3.4.4 32-Bit uses: actions/setup-python@v2 with: python-version: '3.4.4'