[extractor/youtube] Update playlist metadata extraction for new layout (#5376)

Fixes https://github.com/yt-dlp/yt-dlp/issues/5373

Authored by: coletdjnz
This commit is contained in:
Matthew 2022-11-06 18:25:31 +13:00 committed by GitHub
parent 59a0c35865
commit 6141346d18
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -904,20 +904,24 @@ def _extract_video(self, renderer):
video_id = renderer.get('videoId') video_id = renderer.get('videoId')
title = self._get_text(renderer, 'title') title = self._get_text(renderer, 'title')
description = self._get_text(renderer, 'descriptionSnippet') description = self._get_text(renderer, 'descriptionSnippet')
duration = parse_duration(self._get_text(
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) duration = int_or_none(renderer.get('lengthSeconds'))
if duration is None:
duration = parse_duration(self._get_text(
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
if duration is None: if duration is None:
duration = parse_duration(self._search_regex( duration = parse_duration(self._search_regex(
r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$', r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$',
traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str), traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str),
video_id, default=None, group='duration')) video_id, default=None, group='duration'))
view_count = self._get_count(renderer, 'viewCountText', 'shortViewCountText') # videoInfo is a string like '50K views • 10 years ago'.
view_count = self._get_count(renderer, 'viewCountText', 'shortViewCountText', 'videoInfo')
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
channel_id = traverse_obj( channel_id = traverse_obj(
renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'),
expected_type=str, get_all=False) expected_type=str, get_all=False)
time_text = self._get_text(renderer, 'publishedTimeText') or '' time_text = self._get_text(renderer, 'publishedTimeText', 'videoInfo') or ''
scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False))
overlay_style = traverse_obj( overlay_style = traverse_obj(
renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'),
@ -4583,50 +4587,36 @@ def _extract_selected_tab(tabs, fatal=True):
if fatal: if fatal:
raise ExtractorError('Unable to find selected tab') raise ExtractorError('Unable to find selected tab')
def _extract_uploader(self, data):
uploader = {}
renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {}
owner = try_get(
renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
if owner:
owner_text = owner.get('text')
uploader['uploader'] = self._search_regex(
r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text)
uploader['uploader_id'] = try_get(
owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], str)
uploader['uploader_url'] = urljoin(
'https://www.youtube.com/',
try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], str))
return filter_dict(uploader)
def _extract_from_tabs(self, item_id, ytcfg, data, tabs): def _extract_from_tabs(self, item_id, ytcfg, data, tabs):
playlist_id = title = description = channel_url = channel_name = channel_id = None playlist_id = title = description = channel_url = channel_name = channel_id = None
tags = [] tags = []
selected_tab = self._extract_selected_tab(tabs) selected_tab = self._extract_selected_tab(tabs)
# Deprecated - remove when layout discontinued
primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
renderer = try_get( playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer'), expected_type=dict)
metadata_renderer = try_get(
data, lambda x: x['metadata']['channelMetadataRenderer'], dict) data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
if renderer: if metadata_renderer:
channel_name = renderer.get('title') channel_name = metadata_renderer.get('title')
channel_url = renderer.get('channelUrl') channel_url = metadata_renderer.get('channelUrl')
channel_id = renderer.get('externalId') channel_id = metadata_renderer.get('externalId')
else: else:
renderer = try_get( metadata_renderer = try_get(
data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
if renderer: if metadata_renderer:
title = renderer.get('title') title = metadata_renderer.get('title')
description = renderer.get('description', '') description = metadata_renderer.get('description', '')
playlist_id = channel_id playlist_id = channel_id
tags = renderer.get('keywords', '').split() tags = metadata_renderer.get('keywords', '').split()
# We can get the uncropped banner/avatar by replacing the crop params with '=s0' # We can get the uncropped banner/avatar by replacing the crop params with '=s0'
# See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714 # See: https://github.com/yt-dlp/yt-dlp/issues/2237#issuecomment-1013694714
def _get_uncropped(url): def _get_uncropped(url):
return url_or_none((url or '').split('=')[0] + '=s0') return url_or_none((url or '').split('=')[0] + '=s0')
avatar_thumbnails = self._extract_thumbnails(renderer, 'avatar') avatar_thumbnails = self._extract_thumbnails(metadata_renderer, 'avatar')
if avatar_thumbnails: if avatar_thumbnails:
uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url']) uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url'])
if uncropped_avatar: if uncropped_avatar:
@ -4650,14 +4640,33 @@ def _get_uncropped(url):
'preference': -5 'preference': -5
}) })
# Deprecated - remove when old layout is discontinued
primary_thumbnails = self._extract_thumbnails( primary_thumbnails = self._extract_thumbnails(
primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail')) primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail'))
playlist_thumbnails = self._extract_thumbnails(
playlist_header_renderer, ('playlistHeaderBanner', 'heroPlaylistThumbnailRenderer', 'thumbnail'))
if playlist_id is None: if playlist_id is None:
playlist_id = item_id playlist_id = item_id
playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats') # Deprecated - remove primary_sidebar_renderer when old layout discontinued
last_updated_unix = self._parse_time_text(self._get_text(playlist_stats, 2)) # Playlist stats is a text runs array containing [video count, view count, last updated].
# last updated or (view count and last updated) may be missing.
playlist_stats = get_first(
(primary_sidebar_renderer, playlist_header_renderer), (('stats', 'briefStats', 'numVideosText'),))
last_updated_unix = self._parse_time_text(
self._get_text(playlist_stats, 2) # deprecated, remove when old layout discontinued
or self._get_text(playlist_header_renderer, ('byline', 1, 'playlistBylineRenderer', 'text')))
view_count = self._get_count(playlist_stats, 1)
if view_count is None:
view_count = self._get_count(playlist_header_renderer, 'viewCountText')
playlist_count = self._get_count(playlist_stats, 0)
if playlist_count is None:
playlist_count = self._get_count(playlist_header_renderer, ('byline', 0, 'playlistBylineRenderer', 'text'))
if title is None: if title is None:
title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id
title += format_field(selected_tab, 'title', ' - %s') title += format_field(selected_tab, 'title', ' - %s')
@ -4670,16 +4679,29 @@ def _get_uncropped(url):
'uploader': channel_name, 'uploader': channel_name,
'uploader_id': channel_id, 'uploader_id': channel_id,
'uploader_url': channel_url, 'uploader_url': channel_url,
'thumbnails': primary_thumbnails + avatar_thumbnails + channel_banners, 'thumbnails': (primary_thumbnails or playlist_thumbnails) + avatar_thumbnails + channel_banners,
'tags': tags, 'tags': tags,
'view_count': self._get_count(playlist_stats, 1), 'view_count': view_count,
'availability': self._extract_availability(data), 'availability': self._extract_availability(data),
'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'), 'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'),
'playlist_count': self._get_count(playlist_stats, 0), 'playlist_count': playlist_count,
'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')), 'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')),
} }
if not channel_id: if not channel_id:
metadata.update(self._extract_uploader(data)) owner = traverse_obj(playlist_header_renderer, 'ownerText')
if not owner:
# Deprecated
owner = traverse_obj(
self._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer'),
('videoOwner', 'videoOwnerRenderer', 'title'))
owner_text = self._get_text(owner)
browse_ep = traverse_obj(owner, ('runs', 0, 'navigationEndpoint', 'browseEndpoint')) or {}
metadata.update(filter_dict({
'uploader': self._search_regex(r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text),
'uploader_id': browse_ep.get('browseId'),
'uploader_url': urljoin('https://www.youtube.com', browse_ep.get('canonicalBaseUrl'))
}))
metadata.update({ metadata.update({
'channel': metadata['uploader'], 'channel': metadata['uploader'],
'channel_id': metadata['uploader_id'], 'channel_id': metadata['uploader_id'],
@ -4751,19 +4773,21 @@ def _extract_availability(self, data):
Note: Unless YouTube tells us explicitly, we do not assume it is public Note: Unless YouTube tells us explicitly, we do not assume it is public
@param data: response @param data: response
""" """
renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {} sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {}
playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {}
player_header_privacy = playlist_header_renderer.get('privacy')
player_header_privacy = traverse_obj( badges = self._extract_badges(sidebar_renderer)
data, ('header', 'playlistHeaderRenderer', 'privacy'), expected_type=str)
badges = self._extract_badges(renderer)
# Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge
privacy_setting_icon = traverse_obj( privacy_setting_icon = get_first(
renderer, ( (playlist_header_renderer, sidebar_renderer),
'privacyForm', 'dropdownFormFieldRenderer', 'dropdown', 'dropdownRenderer', 'entries', ('privacyForm', 'dropdownFormFieldRenderer', 'dropdown', 'dropdownRenderer', 'entries',
lambda _, v: v['privacyDropdownItemRenderer']['isSelected'], 'privacyDropdownItemRenderer', 'icon', 'iconType'), lambda _, v: v['privacyDropdownItemRenderer']['isSelected'], 'privacyDropdownItemRenderer', 'icon', 'iconType'),
get_all=False, expected_type=str) expected_type=str)
microformats_is_unlisted = traverse_obj(
data, ('microformat', 'microformatDataRenderer', 'unlisted'), expected_type=bool)
return ( return (
'public' if ( 'public' if (
@ -4778,7 +4802,8 @@ def _extract_availability(self, data):
is_unlisted=( is_unlisted=(
self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED)
or player_header_privacy == 'UNLISTED' if player_header_privacy is not None or player_header_privacy == 'UNLISTED' if player_header_privacy is not None
else privacy_setting_icon == 'PRIVACY_UNLISTED' if privacy_setting_icon is not None else None), else privacy_setting_icon == 'PRIVACY_UNLISTED' if privacy_setting_icon is not None
else microformats_is_unlisted if microformats_is_unlisted is not None else None),
needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None,
needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None,
needs_auth=False)) needs_auth=False))
@ -4794,39 +4819,23 @@ def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict):
def _reload_with_unavailable_videos(self, item_id, data, ytcfg): def _reload_with_unavailable_videos(self, item_id, data, ytcfg):
""" """
Get playlist with unavailable videos if the 'show unavailable videos' button exists. Reload playlists with unavailable videos (e.g. private videos, region blocked, etc.)
""" """
browse_id = params = None is_playlist = bool(traverse_obj(
renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') data, ('metadata', 'playlistMetadataRenderer'), ('header', 'playlistHeaderRenderer')))
if not renderer: if not is_playlist:
return return
menu_renderer = try_get(
renderer, lambda x: x['menu']['menuRenderer']['items'], list) or []
for menu_item in menu_renderer:
if not isinstance(menu_item, dict):
continue
nav_item_renderer = menu_item.get('menuNavigationItemRenderer')
text = try_get(
nav_item_renderer, lambda x: x['text']['simpleText'], str)
if not text or text.lower() != 'show unavailable videos':
continue
browse_endpoint = try_get(
nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {}
browse_id = browse_endpoint.get('browseId')
params = browse_endpoint.get('params')
break
headers = self.generate_api_headers( headers = self.generate_api_headers(
ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data), ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data),
visitor_data=self._extract_visitor_data(data, ytcfg)) visitor_data=self._extract_visitor_data(data, ytcfg))
query = { query = {
'params': params or 'wgYCCAA=', 'params': 'wgYCCAA=',
'browseId': browse_id or 'VL%s' % item_id 'browseId': f'VL{item_id}'
} }
return self._extract_response( return self._extract_response(
item_id=item_id, headers=headers, query=query, item_id=item_id, headers=headers, query=query,
check_get_keys='contents', fatal=False, ytcfg=ytcfg, check_get_keys='contents', fatal=False, ytcfg=ytcfg,
note='Downloading API JSON with unavailable videos') note='Redownloading playlist API JSON with unavailable videos')
@functools.cached_property @functools.cached_property
def skip_webpage(self): def skip_webpage(self):
@ -5324,6 +5333,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'channel_url': 'https://www.youtube.com/user/Computerphile', 'channel_url': 'https://www.youtube.com/user/Computerphile',
'channel': 'Computerphile', 'channel': 'Computerphile',
'availability': 'public', 'availability': 'public',
'modified_date': '20190712',
}, },
'playlist_mincount': 11, 'playlist_mincount': 11,
}, { }, {
@ -5659,6 +5669,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor):
'uploader': 'cole-dlp-test-acc', 'uploader': 'cole-dlp-test-acc',
'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA',
'channel': 'cole-dlp-test-acc', 'channel': 'cole-dlp-test-acc',
'channel_follower_count': int,
}, },
'playlist_mincount': 1, 'playlist_mincount': 1,
'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}},