From d816fb28dcdfc27fabfa848a6c1edf20f08b7356 Mon Sep 17 00:00:00 2001 From: Kieran Eglin Date: Thu, 9 May 2024 09:39:23 -0700 Subject: [PATCH] Added 30 day singer extractor --- yt_dlp/extractor/_extractors.py | 4 ++ yt_dlp/extractor/thirtydaysinger.py | 102 ++++++++++++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 yt_dlp/extractor/thirtydaysinger.py diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 42034275b..e3824c445 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -1981,6 +1981,10 @@ from .thestar import TheStarIE from .thesun import TheSunIE from .theweatherchannel import TheWeatherChannelIE +from .thirtydaysinger import ( + ThirtyDaySingerIE, + ThirtyDaySingerPlaylistIE +) from .thisamericanlife import ThisAmericanLifeIE from .thisoldhouse import ThisOldHouseIE from .thisvid import ( diff --git a/yt_dlp/extractor/thirtydaysinger.py b/yt_dlp/extractor/thirtydaysinger.py new file mode 100644 index 000000000..43da6210a --- /dev/null +++ b/yt_dlp/extractor/thirtydaysinger.py @@ -0,0 +1,102 @@ +from .wistia import WistiaIE +from ..utils import ( + clean_html, + get_elements_html_by_class +) + + +class ThirtyDaySingerBase(WistiaIE): + def _extract_for_url(self, url): + lesson_index = self._match_id(url) + webpage = self._download_webpage(url, lesson_index) + match = next(self._extract_wistia_async_embed(webpage)) + embed_config = self._download_embed_config('medias', match.group('id'), url) + + embed_infojson = self._extract_media(embed_config) + webpage_infojson = self._extract_webpage_data(webpage) + + return {**embed_infojson, **webpage_infojson} + + def _extract_webpage_data(self, webpage): + title = self._html_search_regex(r'

([^<]+)

', webpage, 'title') + fallback_title = self._html_extract_title(webpage) + description = self._html_search_meta('description', webpage, fatal=False) + + return { + 'title': title or fallback_title, + 'description': clean_html(self._format_html_list(description)) + } + + # The site makes extensive use of HTML lists for formatting and `clean_html` + # doesn't handle them well. This is needed to keep lists readable. + def _format_html_list(self, html): + replacements = { + '