[itv] Make SOAP request non-fatal and extract metadata from a webpage (closes #16780)

2025-01-31 01:21:29 +01:00 · 2018-06-21 23:06:58 +07:00 · 2018-06-21 23:06:58 +07:00 · 30374f4d40
commit 30374f4d40
parent 91aa502d91
1 changed files with 69 additions and 57 deletions
--- a/youtube_dl/extractor/itv.py
+++ b/youtube_dl/extractor/itv.py
@ -18,6 +18,7 @@
    xpath_element,
    xpath_text,
    int_or_none,
+    merge_dicts,
    parse_duration,
    smuggle_url,
    ExtractorError,
@ -129,64 +130,65 @@ def extract_subtitle(sub_url):

        resp_env = self._download_xml(
            params['data-playlist-url'], video_id,
-            headers=headers, data=etree.tostring(req_env))
-        playlist = xpath_element(resp_env, './/Playlist')
-        if playlist is None:
-            fault_code = xpath_text(resp_env, './/faultcode')
-            fault_string = xpath_text(resp_env, './/faultstring')
-            if fault_code == 'InvalidGeoRegion':
-                self.raise_geo_restricted(
-                    msg=fault_string, countries=self._GEO_COUNTRIES)
-            elif fault_code not in (
-                    'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
-                raise ExtractorError(
-                    '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
-            info.update({
-                'title': self._og_search_title(webpage),
-                'episode_title': params.get('data-video-episode'),
-                'series': params.get('data-video-title'),
-            })
-        else:
-            title = xpath_text(playlist, 'EpisodeTitle', default=None)
-            info.update({
-                'title': title,
-                'episode_title': title,
-                'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
-                'series': xpath_text(playlist, 'ProgrammeTitle'),
-                'duration': parse_duration(xpath_text(playlist, 'Duration')),
-            })
-            video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
-            media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
-            rtmp_url = media_files.attrib['base']
+            headers=headers, data=etree.tostring(req_env), fatal=False)
+        if resp_env:
+            playlist = xpath_element(resp_env, './/Playlist')
+            if playlist is None:
+                fault_code = xpath_text(resp_env, './/faultcode')
+                fault_string = xpath_text(resp_env, './/faultstring')
+                if fault_code == 'InvalidGeoRegion':
+                    self.raise_geo_restricted(
+                        msg=fault_string, countries=self._GEO_COUNTRIES)
+                elif fault_code not in (
+                        'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
+                    raise ExtractorError(
+                        '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
+                info.update({
+                    'title': self._og_search_title(webpage),
+                    'episode_title': params.get('data-video-episode'),
+                    'series': params.get('data-video-title'),
+                })
+            else:
+                title = xpath_text(playlist, 'EpisodeTitle', default=None)
+                info.update({
+                    'title': title,
+                    'episode_title': title,
+                    'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
+                    'series': xpath_text(playlist, 'ProgrammeTitle'),
+                    'duration': parse_duration(xpath_text(playlist, 'Duration')),
+                })
+                video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
+                media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
+                rtmp_url = media_files.attrib['base']

-            for media_file in media_files.findall('MediaFile'):
-                play_path = xpath_text(media_file, 'URL')
-                if not play_path:
-                    continue
-                tbr = int_or_none(media_file.get('bitrate'), 1000)
-                f = {
-                    'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
-                    'play_path': play_path,
-                    # Providing this swfVfy allows to avoid truncated downloads
-                    'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
-                    'page_url': url,
-                    'tbr': tbr,
-                    'ext': 'flv',
-                }
-                app = self._search_regex(
-                    'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
-                if app:
-                    f.update({
-                        'url': rtmp_url.split('?', 1)[0],
-                        'app': app,
-                    })
-                else:
-                    f['url'] = rtmp_url
-                formats.append(f)
+                for media_file in media_files.findall('MediaFile'):
+                    play_path = xpath_text(media_file, 'URL')
+                    if not play_path:
+                        continue
+                    tbr = int_or_none(media_file.get('bitrate'), 1000)
+                    f = {
+                        'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
+                        'play_path': play_path,
+                        # Providing this swfVfy allows to avoid truncated downloads
+                        'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
+                        'page_url': url,
+                        'tbr': tbr,
+                        'ext': 'flv',
+                    }
+                    app = self._search_regex(
+                        'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
+                    if app:
+                        f.update({
+                            'url': rtmp_url.split('?', 1)[0],
+                            'app': app,
+                        })
+                    else:
+                        f['url'] = rtmp_url
+                    formats.append(f)

-            for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
-                if caption_url.text:
-                    extract_subtitle(caption_url.text)
+                for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
+                    if caption_url.text:
+                        extract_subtitle(caption_url.text)

        ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
        hmac = params.get('data-video-hmac')
@ -261,7 +263,17 @@ def extract_subtitle(sub_url):
            'formats': formats,
            'subtitles': subtitles,
        })
-        return info
+
+        webpage_info = self._search_json_ld(webpage, video_id, default={})
+        if not webpage_info.get('title'):
+            webpage_info['title'] = self._html_search_regex(
+                r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
+                webpage, 'title', default=None) or self._og_search_title(
+                webpage, default=None) or self._html_search_meta(
+                'twitter:title', webpage, 'title',
+                default=None) or webpage_info['episode']
+
+        return merge_dicts(info, webpage_info)


 class ITVBTCCIE(InfoExtractor):