From 03025b6e105139d01cd415ddc51fd692957fd2ba Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Thu, 16 Mar 2023 14:53:18 -0500 Subject: [PATCH] [extractor/mediastream] Improve `WinSports` and embed extraction (#6426) Closes #6419, Closes #6527 Authored by: bashonly --- yt_dlp/extractor/mediastream.py | 102 +++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 36 deletions(-) diff --git a/yt_dlp/extractor/mediastream.py b/yt_dlp/extractor/mediastream.py index e8d427a319..cef769f299 100644 --- a/yt_dlp/extractor/mediastream.py +++ b/yt_dlp/extractor/mediastream.py @@ -2,16 +2,44 @@ from .common import InfoExtractor from ..utils import ( + clean_html, remove_end, - str_or_none, - strip_or_none, traverse_obj, urljoin, ) -class MediaStreamIE(InfoExtractor): - _VALID_URL = r'https?://mdstrm.com/(?:embed|live-stream)/(?P\w+)' +class MediaStreamBaseIE(InfoExtractor): + _EMBED_BASE_URL = 'https://mdstrm.com/embed' + _BASE_URL_RE = r'https?://mdstrm\.com/(?:embed|live-stream)' + + def _extract_mediastream_urls(self, webpage): + yield from traverse_obj(list(self._yield_json_ld(webpage, None)), ( + lambda _, v: v['@type'] == 'VideoObject', ('embedUrl', 'contentUrl'), + {lambda x: x if re.match(rf'{self._BASE_URL_RE}/\w+', x) else None})) + + for mobj in re.finditer(r']+>[^>]*playerMdStream\.mdstreamVideo\(\s*[\'"](?P\w+)', webpage): + yield f'{self._EMBED_BASE_URL}/{mobj.group("video_id")}' + + yield from re.findall( + rf']+\bsrc="({self._BASE_URL_RE}/\w+)', webpage) + + for mobj in re.finditer( + r'''(?x) + <(?:div|ps-mediastream)[^>]+ + (class="[^"]*MediaStreamVideoPlayer)[^"]*"[^>]+ + data-video-id="(?P\w+)" + (?:\s*data-video-type="(?P[^"]+))? + (?:[^>]*>\s*]+\1[^"]*"[^>]+data-mediastream=["\'][^>]+ + https://mdstrm\.com/(?Plive-stream))? + ''', webpage): + + video_type = 'live-stream' if mobj.group('video_type') == 'live' or mobj.group('live') else 'embed' + yield f'https://mdstrm.com/{video_type}/{mobj.group("video_id")}' + + +class MediaStreamIE(MediaStreamBaseIE): + _VALID_URL = MediaStreamBaseIE._BASE_URL_RE + r'/(?P\w+)' _TESTS = [{ 'url': 'https://mdstrm.com/embed/6318e3f1d1d316083ae48831', @@ -23,6 +51,7 @@ class MediaStreamIE(InfoExtractor): 'thumbnail': r're:^https?://[^?#]+6318e3f1d1d316083ae48831', 'ext': 'mp4', }, + 'params': {'skip_download': 'm3u8'}, }] _WEBPAGE_TESTS = [{ @@ -35,9 +64,7 @@ class MediaStreamIE(InfoExtractor): 'ext': 'mp4', 'live_status': 'is_live', }, - 'params': { - 'skip_download': 'Livestream' - }, + 'params': {'skip_download': 'Livestream'}, }, { 'url': 'https://www.multimedios.com/television/clases-de-llaves-y-castigos-quien-sabe-mas', 'md5': 'de31f0b1ecc321fb35bf22d58734ea40', @@ -48,6 +75,7 @@ class MediaStreamIE(InfoExtractor): 'thumbnail': 're:^https?://[^?#]+63731bab8ec9b308a2c9ed28', 'ext': 'mp4', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.americatv.com.pe/videos/esto-es-guerra/facundo-gonzalez-sufrio-fuerte-golpe-durante-competencia-frente-hugo-garcia-eeg-noticia-139120', 'info_dict': { @@ -57,6 +85,7 @@ class MediaStreamIE(InfoExtractor): 'thumbnail': 're:^https?://[^?#]+63756df1c638b008a5659dec', 'ext': 'mp4', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.americatv.com.pe/videos/al-fondo-hay-sitio/nuevas-lomas-town-bernardo-mata-se-enfrento-sujeto-luchar-amor-macarena-noticia-139083', 'info_dict': { @@ -66,26 +95,12 @@ class MediaStreamIE(InfoExtractor): 'thumbnail': 're:^https?://[^?#]+637307669609130f74cd3a6e', 'ext': 'mp4', }, + 'params': {'skip_download': 'm3u8'}, }] - @classmethod - def _extract_embed_urls(cls, url, webpage): - for mobj in re.finditer(r']+>[^>]*playerMdStream.mdstreamVideo\(\s*[\'"](?P\w+)', webpage): - yield f'https://mdstrm.com/embed/{mobj.group("video_id")}' - - yield from re.findall( - r']src\s*=\s*"(https://mdstrm.com/[\w-]+/\w+)', webpage) - - for mobj in re.finditer( - r'''(?x) - <(?:div|ps-mediastream)[^>]+ - class\s*=\s*"[^"]*MediaStreamVideoPlayer[^"]*"[^>]+ - data-video-id\s*=\s*"(?P\w+)\s*" - (?:\s*data-video-type\s*=\s*"(?P[^"]+))? - ''', webpage): - - video_type = 'live-stream' if mobj.group('video_type') == 'live' else 'embed' - yield f'https://mdstrm.com/{video_type}/{mobj.group("video_id")}' + def _extract_from_webpage(self, url, webpage): + for embed_url in self._extract_mediastream_urls(webpage): + yield self.url_result(embed_url, MediaStreamIE, None) def _real_extract(self, url): video_id = self._match_id(url) @@ -94,7 +109,7 @@ def _real_extract(self, url): if 'Debido a tu ubicación no puedes ver el contenido' in webpage: self.raise_geo_restricted() - player_config = self._search_json(r'window.MDSTRM.OPTIONS\s*=', webpage, 'metadata', video_id) + player_config = self._search_json(r'window\.MDSTRM\.OPTIONS\s*=', webpage, 'metadata', video_id) formats, subtitles = [], {} for video_format in player_config['src']: @@ -122,7 +137,7 @@ def _real_extract(self, url): } -class WinSportsVideoIE(InfoExtractor): +class WinSportsVideoIE(MediaStreamBaseIE): _VALID_URL = r'https?://www\.winsports\.co/videos/(?P[\w-]+)' _TESTS = [{ @@ -158,21 +173,36 @@ class WinSportsVideoIE(InfoExtractor): 'ext': 'mp4', }, 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.winsports.co/videos/bucaramanga-se-quedo-con-el-grito-de-gol-en-la-garganta', + 'info_dict': { + 'id': '6402adb62bbf3b18d454e1b0', + 'display_id': 'bucaramanga-se-quedo-con-el-grito-de-gol-en-la-garganta', + 'title': '⚽Bucaramanga se quedó con el grito de gol en la garganta', + 'description': 'Gol anulado Bucaramanga', + 'thumbnail': r're:^https?://[^?#]+6402adb62bbf3b18d454e1b0', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - json_ld = self._search_json_ld(webpage, display_id, expected_type='VideoObject', default={}) - media_setting_json = self._search_json( - r']+data-drupal-selector="drupal-settings-json">', webpage, 'drupal-setting-json', display_id) + data = self._search_json( + r']+data-drupal-selector="drupal-settings-json">', webpage, 'data', display_id) - mediastream_id = traverse_obj( - media_setting_json, ('settings', 'mediastream_formatter', ..., 'mediastream_id', {str_or_none}), - get_all=False) or json_ld.get('url') - if not mediastream_id: + mediastream_url = urljoin(f'{self._EMBED_BASE_URL}/', ( + traverse_obj(data, ( + (('settings', 'mediastream_formatter', ..., 'mediastream_id'), 'url'), {str}), get_all=False) + or next(self._extract_mediastream_urls(webpage), None))) + + if not mediastream_url: self.raise_no_formats('No MediaStream embed found in webpage') + title = clean_html(remove_end( + self._search_json_ld(webpage, display_id, expected_type='VideoObject', default={}).get('title') + or self._og_search_title(webpage), '| Win Sports')) + return self.url_result( - urljoin('https://mdstrm.com/embed/', mediastream_id), MediaStreamIE, display_id, url_transparent=True, - display_id=display_id, video_title=strip_or_none(remove_end(json_ld.get('title'), '| Win Sports'))) + mediastream_url, MediaStreamIE, display_id, url_transparent=True, display_id=display_id, video_title=title)