From ccbbd37dca50aca6fe3144976af4f7378aa59470 Mon Sep 17 00:00:00 2001 From: subrat-lima Date: Tue, 27 Aug 2024 02:08:58 +0530 Subject: [PATCH 1/2] added support for jiosaavn artist playlist download --- yt_dlp/extractor/_extractors.py | 1 + yt_dlp/extractor/jiosaavn.py | 78 ++++++++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 9b73fcd75e..8a3753b3ba 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -917,6 +917,7 @@ ) from .jiosaavn import ( JioSaavnAlbumIE, + JioSaavnArtistIE, JioSaavnPlaylistIE, JioSaavnSongIE, ) diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index 030fe686bd..38148df6d1 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -8,6 +8,7 @@ clean_html, int_or_none, make_archive_id, + parse_duration, smuggle_url, unsmuggle_url, url_basename, @@ -172,14 +173,14 @@ class JioSaavnPlaylistIE(JioSaavnBaseIE): 'id': 'DVR,pFUOwyXqIp77B1JF,A__', 'title': 'Mood Hindi', }, - 'playlist_mincount': 801, + 'playlist_mincount': 750, }, { 'url': 'https://www.jiosaavn.com/featured/taaza-tunes/Me5RridRfDk_', 'info_dict': { 'id': 'Me5RridRfDk_', 'title': 'Taaza Tunes', }, - 'playlist_mincount': 301, + 'playlist_mincount': 50, }] _PAGE_SIZE = 50 @@ -199,3 +200,76 @@ def _real_extract(self, url): return self.playlist_result(InAdvancePagedList( functools.partial(self._entries, display_id, playlist_data), total_pages, self._PAGE_SIZE), display_id, traverse_obj(playlist_data, ('listname', {str}))) + + +class JioSaavnArtistIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:artist' + _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/artist/[^/?#]+/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/artist/krsna-songs/rYLBEve2z3U_', + 'info_dict': { + 'id': 'rYLBEve2z3U_', + 'title': 'KR$NA', + }, + 'playlist_mincount': 99, + }, { + 'url': 
'https://www.jiosaavn.com/artist/sanam-puri-songs/SkNEv3qRhDE_', + 'info_dict': { + 'id': 'SkNEv3qRhDE_', + 'title': 'Sanam Puri', + }, + 'playlist_mincount': 55, + }] + _PAGE_SIZE = 50 + + def _fetch_page(self, token, page): + return self._call_api('artist', token, f'artist page {page}', { + 'p': page, 'n': self._PAGE_SIZE, 'api_version': '4', 'category': 'alphabetical', 'sort_order': 'asc'}) + + def _extract_song(self, song_data, url=None): + info = traverse_obj(song_data, { + 'id': ('id', {str}), + 'title': ('title', {clean_html}), + 'album': ('more_info', 'album', {clean_html}), + 'thumbnail': ('image', {clean_html}), + 'duration': ('more_info', 'duration', {parse_duration}), + 'release_year': ('year', {int_or_none}), + 'artists': ('more_info', 'artistMap', 'primary_artists', {lambda x: x['name']}), + 'webpage_url': ('perma_url', {url_or_none}), + }) + if webpage_url := info.get('webpage_url') or url: + info['display_id'] = url_basename(webpage_url) + info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] + + return info + + def _yield_songs(self, playlist_data): + for song_data in traverse_obj(playlist_data, ('topSongs')): + song_info = self._extract_song(song_data) + url = smuggle_url(song_info['webpage_url'], { + 'id': song_data['id'], + 'encrypted_media_url': song_data['more_info']['encrypted_media_url'], + }) + yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info) + + def _entries(self, token, page): + page_data = self._fetch_page(token, page) + yield from self._yield_songs(page_data) + + def _generate_result(self, token): + pagenum = 0 + result = [] + while True: + entries = list(self._entries(token, pagenum)) + if len(entries) == 0: + break + result.extend(entries) + pagenum += 1 + return result + + def _real_extract(self, url): + artist_token_id = self._match_id(url) + artist_playlist_entries = self._generate_result(artist_token_id) + name = self._fetch_page(artist_token_id, 0).get('name') + + return 
self.playlist_result(artist_playlist_entries, artist_token_id, name) From bfcb22aca13170244578e48e94abc37b1e57dc91 Mon Sep 17 00:00:00 2001 From: subrat-lima Date: Tue, 27 Aug 2024 10:49:08 +0530 Subject: [PATCH 2/2] Refactored code to avoid potential bugs Made the following changes: 1. added static page limit to avoid potential infinite loop scenario 2. added comments to explain more about the artist page results 3. updated artist page _call_api params to match browser request 4. refactored code to remove extra first page api call for playlist name --- yt_dlp/extractor/jiosaavn.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index 38148df6d1..759406d05d 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -224,7 +224,8 @@ class JioSaavnArtistIE(JioSaavnBaseIE): def _fetch_page(self, token, page): return self._call_api('artist', token, f'artist page {page}', { - 'p': page, 'n': self._PAGE_SIZE, 'api_version': '4', 'category': 'alphabetical', 'sort_order': 'asc'}) + 'p': page, 'n_song': self._PAGE_SIZE, 'n_album': self._PAGE_SIZE, 'sub_type': '', + 'includeMetaTags': '', 'api_version': '4', 'category': 'alphabetical', 'sort_order': 'asc'}) def _extract_song(self, song_data, url=None): info = traverse_obj(song_data, { @@ -253,23 +254,29 @@ def _yield_songs(self, playlist_data): yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info) def _entries(self, token, page): - page_data = self._fetch_page(token, page) + page_data = self._first_page if page == 0 else self._fetch_page(token, page) yield from self._yield_songs(page_data) def _generate_result(self, token): - pagenum = 0 + # note: + # 1. the total number of songs in a page result is not constant + # 2. 
end of list is identified by 'topSongs' array being empty + page = 0 result = [] - while True: - entries = list(self._entries(token, pagenum)) + + # added static page count limit to avoid potential infinite loop + while page < 20000: + entries = list(self._entries(token, page)) if len(entries) == 0: break result.extend(entries) - pagenum += 1 + page += 1 return result def _real_extract(self, url): - artist_token_id = self._match_id(url) - artist_playlist_entries = self._generate_result(artist_token_id) - name = self._fetch_page(artist_token_id, 0).get('name') + display_id = self._match_id(url) + self._first_page = self._fetch_page(display_id, 0) + entries = self._generate_result(display_id) + name = self._first_page.get('name') - return self.playlist_result(artist_playlist_entries, artist_token_id, name) + return self.playlist_result(entries, display_id, name)