[extractor/ted] fix subtitles extraction

This commit is contained in:
kikuyan 2021-07-22 11:55:17 +09:00
parent a803582717
commit 78c50518f9

View File

@@ -46,10 +46,8 @@ class TEDIE(InfoExtractor):
'consciousness, but that half the time our brains are ' 'consciousness, but that half the time our brains are '
'actively fooling us.'), 'actively fooling us.'),
'uploader': 'Dan Dennett', 'uploader': 'Dan Dennett',
'width': 853,
'duration': 1308, 'duration': 1308,
'view_count': int, 'view_count': int,
'comment_count': int,
'tags': list, 'tags': list,
}, },
'params': { 'params': {
@@ -77,7 +75,7 @@ class TEDIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'Be passionate. Be courageous. Be your best.', 'title': 'Be passionate. Be courageous. Be your best.',
'uploader': 'Gabby Giffords and Mark Kelly', 'uploader': 'Gabby Giffords and Mark Kelly',
'description': 'md5:5174aed4d0f16021b704120360f72b92', 'description': 'md5:37c09e06ce87ddfdb65bf0112ea3551c',
'duration': 1128, 'duration': 1128,
}, },
'params': { 'params': {
@@ -117,7 +115,6 @@ class TEDIE(InfoExtractor):
'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a', 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
'uploader': 'Tom Thum', 'uploader': 'Tom Thum',
'view_count': int, 'view_count': int,
'comment_count': int,
'tags': list, 'tags': list,
}, },
'params': { 'params': {
@@ -308,30 +305,31 @@ class TEDIE(InfoExtractor):
'uploader': player_talk.get('speaker') or talk_info.get('speaker'), 'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'), 'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
'description': self._og_search_description(webpage), 'description': self._og_search_description(webpage),
'subtitles': self._get_subtitles(video_id, talk_info), 'subtitles': self._get_subtitles(player_talk),
'formats': formats, 'formats': formats,
'duration': float_or_none(talk_info.get('duration')), 'duration': float_or_none(talk_info.get('duration')),
'view_count': int_or_none(data.get('viewed_count')), 'view_count': int_or_none(data.get('viewed_count')),
'comment_count': int_or_none(
try_get(data, lambda x: x['comments']['count'])),
'tags': try_get(talk_info, lambda x: x['tags'], list), 'tags': try_get(talk_info, lambda x: x['tags'], list),
} }
def _get_subtitles(self, video_id, talk_info): def _get_subtitles(self, player_talk):
language_list = try_get(player_talk, lambda x: x['languages'], list)
if not language_list:
return {}
metadata = try_get(player_talk, lambda x: x['resources']['hls']['metadata'], compat_str) or ''
proj_master_id = self._search_regex(r'project_masters/([^/]+)/', metadata, 'project master id', fatal=False)
if not proj_master_id:
return {}
sub_lang_list = {} sub_lang_list = {}
for language in try_get( for language in language_list:
talk_info,
(lambda x: x['downloads']['languages'],
lambda x: x['languages']), list):
lang_code = language.get('languageCode') or language.get('ianaCode') lang_code = language.get('languageCode') or language.get('ianaCode')
if not lang_code: if not lang_code:
continue continue
sub_lang_list[lang_code] = [ sub_lang_list[lang_code] = [
{ {
'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext), 'url': 'https://hls.ted.com/project_masters/%s/subtitles/%s/full.vtt' % (proj_master_id, lang_code),
'ext': ext,
} }
for ext in ['ted', 'srt']
] ]
return sub_lang_list return sub_lang_list