Merge pull request #57 from insaneracist/youtube-mix-fix

[youtube] fix: extract mix playlist ids from ytInitialData (#33)
2024-11-23 19:33:59 +01:00 · 2020-11-03 10:33:58 +01:00 · 2020-11-03 10:33:58 +01:00 · 7166f47b18
commit 7166f47b18
parent 471115dbee 5c15c1a0d7
1 changed file with 26 additions and 9 deletions
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -279,6 +279,15 @@ def _download_webpage_handle(self, *args, **kwargs):
        return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
            *args, **compat_kwargs(kwargs))
    def _get_yt_initial_data(self, video_id, webpage):
        """Locate the ``ytInitialData`` assignment in *webpage* and parse it.

        Two spellings of the assignment are tried, in this order:
        ``window["ytInitialData"] = ...`` and ``var ytInitialData = ...``.

        video_id -- forwarded to ``self._parse_json`` together with the
                    extracted text.
        webpage  -- page source that is searched.

        Returns whatever ``self._parse_json(..., fatal=False)`` yields for
        the captured text, or None when neither pattern matches (or the
        capture is empty).
        """
        # The capture group is lazy and stops at the first ';' that directly
        # follows a '}'.
        # NOTE(review): a '};' sequence inside the payload itself would
        # therefore cut the capture short -- confirm this is acceptable.
        patterns = (
            r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
            r'var\s+ytInitialData\s*=\s*(.*?)(?<=});',
        )
        raw_data = self._search_regex(
            patterns, webpage, 'ytInitialData', default=None)
        if not raw_data:
            return None
        # The raw text goes through uppercase_escape before JSON parsing.
        return self._parse_json(
            uppercase_escape(raw_data), video_id, fatal=False)
    def _real_initialize(self):
        if self._downloader is None:
            return
@@ -1398,15 +1407,6 @@ def _get_ytplayer_config(self, video_id, webpage):
            return self._parse_json(
                uppercase_escape(config), video_id, fatal=False)
    def _get_yt_initial_data(self, video_id, webpage):
        """Locate the ``ytInitialData`` assignment in *webpage* and parse it.

        Two spellings of the assignment are tried, in this order:
        ``window["ytInitialData"] = ...`` and ``var ytInitialData = ...``.

        video_id -- forwarded to ``self._parse_json`` together with the
                    extracted text.
        webpage  -- page source that is searched.

        Returns whatever ``self._parse_json(..., fatal=False)`` yields for
        the captured text, or None when neither pattern matches (or the
        capture is empty).
        """
        # The capture group is lazy and stops at the first ';' that directly
        # follows a '}'.
        # NOTE(review): a '};' sequence inside the payload itself would
        # therefore cut the capture short -- confirm this is acceptable.
        patterns = (
            r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
            r'var\s+ytInitialData\s*=\s*(.*?)(?<=});',
        )
        raw_data = self._search_regex(
            patterns, webpage, 'ytInitialData', default=None)
        if not raw_data:
            return None
        # The raw text goes through uppercase_escape before JSON parsing.
        return self._parse_json(
            uppercase_escape(raw_data), video_id, fatal=False)
    def _get_music_metadata_from_yt_initial(self, yt_initial):
        music_metadata = []
        key_map = {
@@ -2828,6 +2828,16 @@ def extract_videos_from_page(self, page):
        return zip(ids_in_page, titles_in_page)
    def _extract_mix_ids_from_yt_initial(self, yt_initial):
        """Collect the video ids listed in the playlist panel of *yt_initial*.

        The panel entries are read from
        ``contents.twoColumnWatchNextResults.playlist.playlist.contents``
        and each entry contributes its ``playlistPanelVideoRenderer.videoId``.

        Returns a list of ids in the order the entries appear; the list is
        empty when the panel lookup gives nothing or no entry carries a
        non-empty id.
        """
        # NOTE(review): try_get is assumed to return None (rather than raise)
        # when a key is missing or the value is not of the requested type --
        # confirm against its definition.
        panel_entries = try_get(
            yt_initial,
            lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'],
            list) or []
        candidates = (
            try_get(entry, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str)
            for entry in panel_entries)
        # Entries without a usable (non-empty) id are dropped.
        return [video_id for video_id in candidates if video_id]
    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
@@ -2841,6 +2851,13 @@ def _extract_mix(self, playlist_id):
                r'''(?xs)data-video-username=".*?".*?
                           href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
                webpage))
            # if no ids in html of page, try using embedded json
            if (len(new_ids) == 0):
                yt_initial = self._get_yt_initial_data(playlist_id, webpage)
                if yt_initial:
                    new_ids = self._extract_mix_ids_from_yt_initial(yt_initial)
            # Fetch new pages until all the videos are repeated, it seems that
            # there are always 51 unique videos.
            new_ids = [_id for _id in new_ids if _id not in ids]