# NOTE: this class needs `import json` at the top of yt_dlp/extractor/ard.py and
# an `ARDAudiothekIE` entry in the `.ard` import list of
# yt_dlp/extractor/_extractors.py.
class ARDAudiothekIE(InfoExtractor):
    """Extractor for ardaudiothek.de episodes and show/series/collection pages.

    Single items (``player``, ``live``, ``episode`` URLs) are resolved to one
    info dict; ``sendung``, ``serie`` and ``sammlung`` URLs are resolved to a
    playlist of episode URLs that are handled by this same extractor.
    """

    IE_NAME = 'ARD:audiothek'
    # The `playlist` group only participates for show-like URLs.  The two
    # conditionals key off it: the display id is matched lazily for playlists
    # (so that the id and optional season that follow are not swallowed) and
    # greedily for single items (whose display id may itself contain slashes).
    _VALID_URL = r'''(?x)https://
        (?:www\.)?ardaudiothek\.de/
        (?:player|live|episode|(?P<playlist>sendung|serie|sammlung))/
        (?P<display_id>(?(playlist)[^?#]+?|[^?#]+))/
        (?P<id>[a-zA-Z0-9]+)
        (?(playlist)/(?P<season>\d+)?/?(?:[?#]|$))'''

    _TESTS = [{
        'url': 'https://www.ardaudiothek.de/sendung/1live-caiman-club/53375276/',
        'info_dict': {
            'id': '53375276',
            'title': '1LIVE Caiman Club',
            'description': 'md5:003cff043a41b14cf045b960b89aaa86',
        },
        'playlist_mincount': 22,
    }, {
        'url': 'https://www.ardaudiothek.de/episode/1live-caiman-club/caiman-club-s04e04-cash-out/1live/13556081/',
        'info_dict': {
            'id': '13556081',
            'ext': 'mp3',
            'upload_date': '20240717',
            'duration': 3339,
            'title': 'CAIMAN CLUB (S04E04): Cash Out',
            'thumbnail': 'https://api.ardmediathek.de/image-service/images/urn:ard:image:d5014b612429c396',
            'description': 'md5:8decf7974ed1cbf5a9d2c537940e1c4b',
            'display_id': '1live-caiman-club/caiman-club-s04e04-cash-out/1live',
            'timestamp': 1721181641,
            'series': '1LIVE Caiman Club',
        },
    }]

    # GraphQL selection for a show; `%s` is filled with the numeric show id.
    _QUERY_PLAYLIST = '''\
        show(id: "%s") {
            title
            description
            items {
                nodes {
                    url
                    episodeNumber
                    grouping
                    isPublished
                }
            }
        }'''

    # GraphQL selection for a single item; `%s` is filled with the item id.
    _QUERY_ITEM = '''\
        item(id: "%s") {
            audioList {
                href
                distributionType
            }
            show {
                title
            }
            image {
                url
            }
            synopsis
            title
            duration
            startDate
        }'''

    _GRAPHQL_ENDPOINT = 'https://api.ardaudiothek.de/graphql'

    def _graphql_query(self, display_id, query):
        """POST `query` (wrapped in braces) to the GraphQL endpoint.

        Returns the value of the `data` key of the JSON response.
        `display_id` is only used as the id passed to `_download_json`.
        """
        return self._download_json(
            self._GRAPHQL_ENDPOINT,
            display_id,
            data=json.dumps({'query': '{' + query + '}'}).encode(),
            headers={
                'Content-Type': 'application/json',
            },
        )['data']

    def _real_extract(self, url):
        """Return a playlist for show-like URLs, otherwise a single info dict."""
        video_id, display_id, playlist_type = self._match_valid_url(url).group(
            'id', 'display_id', 'playlist')
        # A display id made up solely of separators carries no information.
        if re.fullmatch(r'[/-]*', display_id):
            display_id = video_id

        if playlist_type:
            playlist_info = traverse_obj(
                self._graphql_query(display_id, self._QUERY_PLAYLIST % video_id),
                ('show', {dict})) or {}
            # Skip unpublished nodes and nodes without a URL instead of
            # raising on partial API data.
            published = traverse_obj(playlist_info, (
                'items', 'nodes', lambda _, v: v['isPublished'] and v['url']))
            entries = [
                self.url_result(episode['url'], ie=ARDAudiothekIE.ie_key())
                for episode in published
            ]
            # Pass the title exactly once: prefer the API title and fall back
            # to the URL slug, so that the slug never shadows the real title.
            return self.playlist_result(
                entries, video_id,
                traverse_obj(playlist_info, ('title', {str})) or display_id,
                traverse_obj(playlist_info, ('description', {str})))

        # Fetch the item once and reuse it for both formats and metadata.
        item = traverse_obj(
            self._graphql_query(display_id, self._QUERY_ITEM % video_id),
            ('item', {dict})) or {}

        return {
            'display_id': display_id,
            'formats': traverse_obj(item, (
                'audioList', lambda _, v: url_or_none(v['href']), {
                    'url': ('href', {url_or_none}),
                    'format_id': ('distributionType', {str}),
                })),
            'id': video_id,
            **traverse_obj(item, {
                'description': ('synopsis', {str}),
                'duration': ('duration', {int_or_none}),
                'series': ('show', 'title'),
                'thumbnail': ('image', 'url', {url_or_none}),
                'timestamp': ('startDate', {parse_iso8601}),
                'title': ('title', {str}),
            }),
        }