From ed711897814f3ee0b1822e4205e74133467e8f1c Mon Sep 17 00:00:00 2001 From: trainman261 Date: Sun, 20 Aug 2023 18:35:57 +0200 Subject: [PATCH] [ie/CBCPlayerPlaylist] Add extractor (#7870) Authored by: trainman261 --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/cbc.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index d4d3b6074c..194ad8356f 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -303,6 +303,7 @@ from .cbc import ( CBCIE, CBCPlayerIE, + CBCPlayerPlaylistIE, CBCGemIE, CBCGemPlaylistIE, CBCGemLiveIE, diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 9413281a57..b3c5471f7b 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -2,6 +2,7 @@ import json import base64 import time +import urllib.parse from .common import InfoExtractor from ..compat import ( @@ -228,6 +229,38 @@ def _real_extract(self, url): } +class CBCPlayerPlaylistIE(InfoExtractor): + IE_NAME = 'cbc.ca:player:playlist' + _VALID_URL = r'https?://(?:www\.)?cbc\.ca/(?:player/)(?!play/)(?P[^?#]+)' + _TESTS = [{ + 'url': 'https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast', + 'playlist_mincount': 25, + 'info_dict': { + 'id': 'news/tv shows/the national/latest broadcast', + } + }, { + 'url': 'https://www.cbc.ca/player/news/Canada/North', + 'playlist_mincount': 25, + 'info_dict': { + 'id': 'news/canada/north', + } + }] + + def _real_extract(self, url): + playlist_id = urllib.parse.unquote(self._match_id(url)).lower() + webpage = self._download_webpage(url, playlist_id) + json_content = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', playlist_id) + + def entries(): + for video_id in traverse_obj(json_content, ( + 'video', 'clipsByCategory', lambda k, _: k.lower() == playlist_id, 'items', ..., 'id' + )): + yield self.url_result(f'https://www.cbc.ca/player/play/{video_id}', CBCPlayerIE) + + return self.playlist_result(entries(), playlist_id) + + class CBCGemIE(InfoExtractor): IE_NAME = 'gem.cbc.ca' _VALID_URL = r'https?://gem\.cbc\.ca/(?:media/)?(?P[0-9a-z-]+/s[0-9]+[a-z][0-9]+)'