From 754940e9a558565d6bd3c0c529802569b1d0ae4e Mon Sep 17 00:00:00 2001 From: sepro Date: Sun, 3 Nov 2024 21:19:35 +0100 Subject: [PATCH] [ie/bfmtv] Fix extractors (#11444) Authored by: seproDev --- yt_dlp/extractor/bfmtv.py | 66 ++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/yt_dlp/extractor/bfmtv.py b/yt_dlp/extractor/bfmtv.py index 87f011783b..49d4819a3d 100644 --- a/yt_dlp/extractor/bfmtv.py +++ b/yt_dlp/extractor/bfmtv.py @@ -1,18 +1,33 @@ import re from .common import InfoExtractor -from ..utils import extract_attributes +from ..utils import ExtractorError, extract_attributes class BFMTVBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.|rmc\.)?bfmtv\.com/' _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P\d{12})\.html' - _VIDEO_BLOCK_REGEX = r'(]+class="video_block[^"]*"[^>]*>)' + _VIDEO_BLOCK_REGEX = r'(]+class="video_block[^"]*"[^>]*>.*?)' + _VIDEO_ELEMENT_REGEX = r'(]+>)' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' - def _brightcove_url_result(self, video_id, video_block): - account_id = video_block.get('accountid') or '876450612001' - player_id = video_block.get('playerid') or 'I2qBTln4u' + def _extract_video(self, video_block): + video_element = self._search_regex( + self._VIDEO_ELEMENT_REGEX, video_block, 'video element', default=None) + if video_element: + video_element_attrs = extract_attributes(video_element) + video_id = video_element_attrs.get('data-video-id') + if not video_id: + return + account_id = video_element_attrs.get('data-account') or '876450610001' + player_id = video_element_attrs.get('adjustplayer') or '19dszYXgm' + else: + video_block_attrs = extract_attributes(video_block) + video_id = video_block_attrs.get('videoid') + if not video_id: + return + account_id = video_block_attrs.get('accountid') or '876630703001' + player_id = video_block_attrs.get('playerid') or 'KbPwEbuHx' return self.url_result( self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), 'BrightcoveNew', video_id) @@ -40,23 +55,25 @@ class BFMTVIE(BFMTVBaseIE): def _real_extract(self, url): bfmtv_id = self._match_id(url) webpage = self._download_webpage(url, bfmtv_id) - video_block = extract_attributes(self._search_regex( + video = self._extract_video(self._search_regex( self._VIDEO_BLOCK_REGEX, webpage, 'video block')) - return self._brightcove_url_result(video_block['videoid'], video_block) + if not video: + raise ExtractorError('Failed to extract video') + return video -class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE +class BFMTVLiveIE(BFMTVBaseIE): IE_NAME = 'bfmtv:live' _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P(?:[^/]+/)?en-direct)' _TESTS = [{ 'url': 'https://www.bfmtv.com/en-direct/', 'info_dict': { - 'id': '5615950982001', + 'id': '6346069778112', 'ext': 'mp4', - 'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'title': r're:^Le Live BFM TV \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'uploader_id': '876450610001', - 'upload_date': '20220926', - 'timestamp': 1664207191, + 'upload_date': '20240202', + 'timestamp': 1706887572, 'live_status': 'is_live', 'thumbnail': r're:https://.+/image\.jpg', 'tags': [], @@ -69,6 +86,15 @@ class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE 'only_matching': True, }] + def _real_extract(self, url): + bfmtv_id = self._match_id(url) + webpage = self._download_webpage(url, bfmtv_id) + video = self._extract_video(self._search_regex( + self._VIDEO_BLOCK_REGEX, webpage, 'video block')) + if not video: + raise ExtractorError('Failed to extract video') + return video + class BFMTVArticleIE(BFMTVBaseIE): IE_NAME = 'bfmtv:article' @@ -102,18 +128,16 @@ class BFMTVArticleIE(BFMTVBaseIE): }, }] + def _entries(self, webpage): + for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage): + video = self._extract_video(video_block_el) + if video: + yield video + def _real_extract(self, url): bfmtv_id = self._match_id(url) webpage = self._download_webpage(url, bfmtv_id) - entries = [] - for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage): - video_block = extract_attributes(video_block_el) - video_id = video_block.get('videoid') - if not video_id: - continue - entries.append(self._brightcove_url_result(video_id, video_block)) - return self.playlist_result( - entries, bfmtv_id, self._og_search_title(webpage, fatal=False), + self._entries(webpage), bfmtv_id, self._og_search_title(webpage, fatal=False), self._html_search_meta(['og:description', 'description'], webpage))