From 960b8931c6bedffc186ddbc576f62089f0d65234 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Wed, 15 May 2024 16:39:56 +0100
Subject: [PATCH 01/16] Fix podcast and person playlist downloads

---
 yt_dlp/extractor/radiofrance.py | 104 +++++++++++++++++++++++---------
 1 file changed, 77 insertions(+), 27 deletions(-)
diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 6bd6fe9b68..3ada312ba5 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -267,10 +267,10 @@ def _real_extract(self, url):
 class RadioFrancePlaylistBaseIE(RadioFranceBaseIE):
     """Subclasses must set _METADATA_KEY"""
 
-    def _call_api(self, content_id, cursor, page_num):
+    def _call_api(self, station, content_id, cursor):
         raise NotImplementedError('This method must be implemented by subclasses')
 
-    def _generate_playlist_entries(self, content_id, content_response):
+    def _generate_playlist_entries(self, station, content_id, content_response):
         for page_num in itertools.count(2):
             for entry in content_response['items']:
                 yield self.url_result(
@@ -281,28 +281,39 @@ def _generate_playlist_entries(self, content_id, content_response):
                         'thumbnail': ('visual', 'src'),
                     }))
 
-            next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
-            if not next_cursor:
+            if not content_response["next"]:
                 break
 
-            content_response = self._call_api(content_id, next_cursor, page_num)
+            content_response = self._call_api(station, content_id, content_response["next"])
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
+        playlist_id = self._match_id(url)
+        # If it is a podcast playlist, get the name of the station it is on
+        # profile page playlists are not attached to a station currently
+        station = self._match_valid_url(url).group('station') if isinstance(self, RadioFrancePodcastIE) else None
 
-        metadata = self._download_json(
-            'https://www.radiofrance.fr/api/v2.1/path', display_id,
-            query={'value': urllib.parse.urlparse(url).path})['content']
+        # Check if user started request from a page other than 1
+        startpage = 1
+        parsedurl = urllib.parse.urlparse(url)
+        if parsedurl.query:
+            startpagequery = urllib.parse.parse_qs(parsedurl.query)
+            if 'p' in startpagequery:
+                startpage = int(startpagequery['p'][0])
 
-        content_id = metadata['id']
+        # Get data for the first page, and the uuid for the playlist
+        metadata = self._call_api(station, playlist_id, startpage)
+        uuid = traverse_obj(metadata, ('metadata', 'id'))
 
+        # This method should return the final playlist metadata which yt-dlp can then use to download everything
         return self.playlist_result(
-            self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id,
-            display_id=display_id, **{**traverse_obj(metadata, {
+            self._generate_playlist_entries(station, playlist_id, metadata),
+            uuid,
+            display_id=playlist_id,
+            **{**traverse_obj(metadata['metadata'], {
                 'title': 'title',
                 'description': 'standFirst',
                 'thumbnail': ('visual', 'src'),
-            }), **traverse_obj(metadata, {
+            }), **traverse_obj(metadata['metadata'], {
                 'title': 'name',
                 'description': 'role',
             })})
@@ -311,7 +322,7 @@ def _real_extract(self, url):
 class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
     _VALID_URL = rf'''(?x)
         {RadioFranceBaseIE._VALID_URL_BASE}
-        /(?:{RadioFranceBaseIE._STATIONS_RE})
+        /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
         /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
     '''
 
@@ -363,10 +374,27 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
 
     _METADATA_KEY = 'expressions'
 
-    def _call_api(self, podcast_id, cursor, page_num):
-        return self._download_json(
-            f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id,
-            note=f'Downloading page {page_num}', query={'pageCursor': cursor})
+    def _call_api(self, station, podcast_id, cursor):
+        # The data is stored in the last <script> tag on a page
+        url = 'https://www.radiofrance.fr/' + station + '/podcasts/' + podcast_id + '?p=' + str(cursor)
+        webpage = self._download_webpage(url, podcast_id, note=f'Downloading {podcast_id} page {cursor}')
+
+        resp = dict()
+
+        # _search_json cannot parse the data as it contains javascript
+        # Therefore, parse the episodes objects array separately
+        resp['items'] = self._search_json(r'a.items\s*=\s*', webpage, podcast_id, podcast_id,
+                                          contains_pattern=r'\[.+\]', transform_source=js_to_json)
+
+        # the pagination data is stored in a javascript object 'a'
+        lastPage = int(re.search(r'a\.lastPage\s*=\s*(\d+);', webpage).group(1))
+        hasMorePages = cursor < lastPage
+        resp['next'] = cursor + 1 if hasMorePages else None
+
+        resp['metadata'] = self._search_json(r'content:\s*', webpage, podcast_id, podcast_id,
+                                             transform_source=js_to_json)
+
+        return resp
 
 
 class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
@@ -380,7 +408,7 @@ class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
             'title': 'Thomas Pesquet',
             'description': 'Astronaute à l\'agence spatiale européenne',
         },
-        'playlist_mincount': 212,
+        'playlist_mincount': 158,
     }, {
         'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
         'info_dict': {
@@ -398,15 +426,37 @@ class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
 
     _METADATA_KEY = 'documents'
 
-    def _call_api(self, profile_id, cursor, page_num):
-        resp = self._download_json(
-            f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id,
-            note=f'Downloading page {page_num}', query={
-                'relation': 'personality',
-                'cursor': cursor,
-            })
+    def _call_api(self, station, profile_id, cursor):
+        url = 'https://www.radiofrance.fr/personnes/' + profile_id + '?p=' + str(cursor)
+        webpage = self._download_webpage(url, profile_id, note=f'Downloading {profile_id} page {cursor}')
+
+        resp = dict()
+
+        # On profile pages, the data is stored in a javascript array in the final <script>
+        # Each episode is stored as
+        # a[0] = { id: ... }; a[1] = [ id: ... ]; etc.
+        # Annoyingly, sometimes it is delivered using 'b', with 'a' holding metadata
+        resp['items'] = []
+        podcastindex = 0
+        nextmatch = True
+        while nextmatch:
+            nextmatch = self._search_json(r'\w+\[' + str(podcastindex) + r'\]\s*=\s*', webpage, profile_id,
+                                          profile_id, transform_source=js_to_json, fatal=False, default=None)
+            podcastindex += 1
+            if nextmatch is not None:
+                resp['items'].append(nextmatch)
+
+        # There is more than one pagination key in the final <script>
+        # We should use pick the pagination object which is within a documents object
+        pagedata = self._search_json(r'documents\s*:\s*', webpage, profile_id, profile_id,
+                                     transform_source=js_to_json)
+        lastPage = traverse_obj(pagedata, ('pagination', 'lastPage'))
+        hasMorePages = cursor < lastPage
+        resp['next'] = cursor + 1 if hasMorePages else None
+
+        resp['metadata'] = self._search_json(r'content:\s*', webpage, profile_id, profile_id,
+                                             transform_source=js_to_json)
 
-        resp['next'] = traverse_obj(resp, ('pagination', 'next'))
         return resp
 
 

From e2243c20333a434f5e23f2dddc2d58ea184d9488 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Wed, 15 May 2024 16:39:56 +0100
Subject: [PATCH 02/16] [RadioFrance] Fix podcast and person playlist downloads

---
 yt_dlp/extractor/radiofrance.py | 104 +++++++++++++++++++++++---------
 1 file changed, 77 insertions(+), 27 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 6bd6fe9b68..3ada312ba5 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -267,10 +267,10 @@ def _real_extract(self, url):
 class RadioFrancePlaylistBaseIE(RadioFranceBaseIE):
     """Subclasses must set _METADATA_KEY"""
 
-    def _call_api(self, content_id, cursor, page_num):
+    def _call_api(self, station, content_id, cursor):
         raise NotImplementedError('This method must be implemented by subclasses')
 
-    def _generate_playlist_entries(self, content_id, content_response):
+    def _generate_playlist_entries(self, station, content_id, content_response):
         for page_num in itertools.count(2):
             for entry in content_response['items']:
                 yield self.url_result(
@@ -281,28 +281,39 @@ def _generate_playlist_entries(self, content_id, content_response):
                         'thumbnail': ('visual', 'src'),
                     }))
 
-            next_cursor = traverse_obj(content_response, (('pagination', None), 'next'), get_all=False)
-            if not next_cursor:
+            if not content_response["next"]:
                 break
 
-            content_response = self._call_api(content_id, next_cursor, page_num)
+            content_response = self._call_api(station, content_id, content_response["next"])
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
+        playlist_id = self._match_id(url)
+        # If it is a podcast playlist, get the name of the station it is on
+        # profile page playlists are not attached to a station currently
+        station = self._match_valid_url(url).group('station') if isinstance(self, RadioFrancePodcastIE) else None
 
-        metadata = self._download_json(
-            'https://www.radiofrance.fr/api/v2.1/path', display_id,
-            query={'value': urllib.parse.urlparse(url).path})['content']
+        # Check if user started request from a page other than 1
+        startpage = 1
+        parsedurl = urllib.parse.urlparse(url)
+        if parsedurl.query:
+            startpagequery = urllib.parse.parse_qs(parsedurl.query)
+            if 'p' in startpagequery:
+                startpage = int(startpagequery['p'][0])
 
-        content_id = metadata['id']
+        # Get data for the first page, and the uuid for the playlist
+        metadata = self._call_api(station, playlist_id, startpage)
+        uuid = traverse_obj(metadata, ('metadata', 'id'))
 
+        # This method should return the final playlist metadata which yt-dlp can then use to download everything
         return self.playlist_result(
-            self._generate_playlist_entries(content_id, metadata[self._METADATA_KEY]), content_id,
-            display_id=display_id, **{**traverse_obj(metadata, {
+            self._generate_playlist_entries(station, playlist_id, metadata),
+            uuid,
+            display_id=playlist_id,
+            **{**traverse_obj(metadata['metadata'], {
                 'title': 'title',
                 'description': 'standFirst',
                 'thumbnail': ('visual', 'src'),
-            }), **traverse_obj(metadata, {
+            }), **traverse_obj(metadata['metadata'], {
                 'title': 'name',
                 'description': 'role',
             })})
@@ -311,7 +322,7 @@ def _real_extract(self, url):
 class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
     _VALID_URL = rf'''(?x)
         {RadioFranceBaseIE._VALID_URL_BASE}
-        /(?:{RadioFranceBaseIE._STATIONS_RE})
+        /(?P<station>{RadioFranceBaseIE._STATIONS_RE})
         /podcasts/(?P<id>[\w-]+)/?(?:[?#]|$)
     '''
 
@@ -363,10 +374,27 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
 
     _METADATA_KEY = 'expressions'
 
-    def _call_api(self, podcast_id, cursor, page_num):
-        return self._download_json(
-            f'https://www.radiofrance.fr/api/v2.1/concepts/{podcast_id}/expressions', podcast_id,
-            note=f'Downloading page {page_num}', query={'pageCursor': cursor})
+    def _call_api(self, station, podcast_id, cursor):
+        # The data is stored in the last <script> tag on a page
+        url = 'https://www.radiofrance.fr/' + station + '/podcasts/' + podcast_id + '?p=' + str(cursor)
+        webpage = self._download_webpage(url, podcast_id, note=f'Downloading {podcast_id} page {cursor}')
+
+        resp = dict()
+
+        # _search_json cannot parse the data as it contains javascript
+        # Therefore, parse the episodes objects array separately
+        resp['items'] = self._search_json(r'a.items\s*=\s*', webpage, podcast_id, podcast_id,
+                                          contains_pattern=r'\[.+\]', transform_source=js_to_json)
+
+        # the pagination data is stored in a javascript object 'a'
+        lastPage = int(re.search(r'a\.lastPage\s*=\s*(\d+);', webpage).group(1))
+        hasMorePages = cursor < lastPage
+        resp['next'] = cursor + 1 if hasMorePages else None
+
+        resp['metadata'] = self._search_json(r'content:\s*', webpage, podcast_id, podcast_id,
+                                             transform_source=js_to_json)
+
+        return resp
 
 
 class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
@@ -380,7 +408,7 @@ class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
             'title': 'Thomas Pesquet',
             'description': 'Astronaute à l\'agence spatiale européenne',
         },
-        'playlist_mincount': 212,
+        'playlist_mincount': 158,
     }, {
         'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
         'info_dict': {
@@ -398,15 +426,37 @@ class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
 
     _METADATA_KEY = 'documents'
 
-    def _call_api(self, profile_id, cursor, page_num):
-        resp = self._download_json(
-            f'https://www.radiofrance.fr/api/v2.1/taxonomy/{profile_id}/documents', profile_id,
-            note=f'Downloading page {page_num}', query={
-                'relation': 'personality',
-                'cursor': cursor,
-            })
+    def _call_api(self, station, profile_id, cursor):
+        url = 'https://www.radiofrance.fr/personnes/' + profile_id + '?p=' + str(cursor)
+        webpage = self._download_webpage(url, profile_id, note=f'Downloading {profile_id} page {cursor}')
+
+        resp = dict()
+
+        # On profile pages, the data is stored in a javascript array in the final <script>
+        # Each episode is stored as
+        # a[0] = { id: ... }; a[1] = [ id: ... ]; etc.
+        # Annoyingly, sometimes it is delivered using 'b', with 'a' holding metadata
+        resp['items'] = []
+        podcastindex = 0
+        nextmatch = True
+        while nextmatch:
+            nextmatch = self._search_json(r'\w+\[' + str(podcastindex) + r'\]\s*=\s*', webpage, profile_id,
+                                          profile_id, transform_source=js_to_json, fatal=False, default=None)
+            podcastindex += 1
+            if nextmatch is not None:
+                resp['items'].append(nextmatch)
+
+        # There is more than one pagination key in the final <script>
+        # We should use pick the pagination object which is within a documents object
+        pagedata = self._search_json(r'documents\s*:\s*', webpage, profile_id, profile_id,
+                                     transform_source=js_to_json)
+        lastPage = traverse_obj(pagedata, ('pagination', 'lastPage'))
+        hasMorePages = cursor < lastPage
+        resp['next'] = cursor + 1 if hasMorePages else None
+
+        resp['metadata'] = self._search_json(r'content:\s*', webpage, profile_id, profile_id,
+                                             transform_source=js_to_json)
 
-        resp['next'] = traverse_obj(resp, ('pagination', 'next'))
         return resp
 
 

From 827560f2b994cf4d32f2f8579648938d85458297 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Thu, 16 May 2024 10:47:28 +0100
Subject: [PATCH 03/16] [RadioFrance] Episode selection is already handled;
 don't add it here

---
 yt_dlp/extractor/radiofrance.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 3ada312ba5..b9e01c7898 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -292,19 +292,10 @@ def _real_extract(self, url):
         # profile page playlists are not attached to a station currently
         station = self._match_valid_url(url).group('station') if isinstance(self, RadioFrancePodcastIE) else None
 
-        # Check if user started request from a page other than 1
-        startpage = 1
-        parsedurl = urllib.parse.urlparse(url)
-        if parsedurl.query:
-            startpagequery = urllib.parse.parse_qs(parsedurl.query)
-            if 'p' in startpagequery:
-                startpage = int(startpagequery['p'][0])
-
         # Get data for the first page, and the uuid for the playlist
-        metadata = self._call_api(station, playlist_id, startpage)
+        metadata = self._call_api(station, playlist_id, 1)
         uuid = traverse_obj(metadata, ('metadata', 'id'))
 
-        # This method should return the final playlist metadata which yt-dlp can then use to download everything
         return self.playlist_result(
             self._generate_playlist_entries(station, playlist_id, metadata),
             uuid,
@@ -408,7 +399,7 @@ class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
             'title': 'Thomas Pesquet',
             'description': 'Astronaute à l\'agence spatiale européenne',
         },
-        'playlist_mincount': 158,
+        'playlist_mincount': 212,
     }, {
         'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
         'info_dict': {

From a8edca98f53a9cbc1dc0352cf1d464eb8fbabcbe Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Thu, 16 May 2024 10:59:56 +0100
Subject: [PATCH 04/16] [RadioFrance] Fix live substations

---
 yt_dlp/extractor/radiofrance.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index b9e01c7898..e1c6fc5110 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -237,7 +237,8 @@ def _real_extract(self, url):
 
         if substation_id:
             webpage = self._download_webpage(url, station_id)
-            api_response = self._extract_data_from_webpage(webpage, station_id, 'webRadioData')
+            api_response = self._search_json(r'webradioLive:\s*', webpage, station_id, substation_id,
+                                             transform_source=js_to_json)
         else:
             api_response = self._download_json(
                 f'https://www.radiofrance.fr/{station_id}/api/live', station_id)

From 1f719e1934fe638077585d5af14d86eeaff461e5 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Thu, 16 May 2024 11:00:08 +0100
Subject: [PATCH 05/16] [RadioFrance] Cleanup imports

---
 yt_dlp/extractor/radiofrance.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index e1c6fc5110..cc0124d836 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -1,6 +1,5 @@
 import itertools
 import re
-import urllib.parse
 
 from .common import InfoExtractor
 from ..utils import (

From 7308dc895c73acf71c4293f41f3d54371fd367c5 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Thu, 16 May 2024 11:29:16 +0100
Subject: [PATCH 06/16] [RadioFrance] Fix outdated tests

---
 yt_dlp/extractor/radiofrance.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index cc0124d836..449fb7e74e 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -328,15 +328,15 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
         },
         'playlist_mincount': 11,
     }, {
-        'url': 'https://www.radiofrance.fr/franceinter/podcasts/jean-marie-le-pen-l-obsession-nationale',
+        'url': 'https://www.radiofrance.fr/franceinter/podcasts/avec-la-langue',
         'info_dict': {
-            'id': '566fd524-3074-4fbc-ac69-8696f2152a54',
-            'display_id': 'jean-marie-le-pen-l-obsession-nationale',
-            'title': 'Jean-Marie Le Pen, l\'obsession nationale',
-            'description': 'md5:a07c0cfb894f6d07a62d0ad12c4b7d73',
+            'id': '53a95989-7c61-48c7-873c-6a71009101bb',
+            'display_id': 'avec-la-langue',
+            'title': 'Avec la langue',
+            'description': 'md5:4ddb6d4ed46dbbdee611b8e16e4af868',
             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
         },
-        'playlist_count': 7,
+        'playlist_mincount': 36,
     }, {
         'url': 'https://www.radiofrance.fr/franceculture/podcasts/serie-thomas-grjebine',
         'info_dict': {
@@ -351,7 +351,7 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
             'id': '143dff38-e956-4a5d-8576-1c0b7242b99e',
             'display_id': 'certains-l-aiment-fip',
             'title': 'Certains l’aiment Fip',
-            'description': 'md5:ff974672ba00d4fd5be80fb001c5b27e',
+            'description': 'md5:7c373cdcec7a024f12fa34de7612e44e',
             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
         },
         'playlist_mincount': 321,
@@ -399,7 +399,7 @@ class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
             'title': 'Thomas Pesquet',
             'description': 'Astronaute à l\'agence spatiale européenne',
         },
-        'playlist_mincount': 212,
+        'playlist_mincount': 100,
     }, {
         'url': 'https://www.radiofrance.fr/personnes/eugenie-bastie',
         'info_dict': {

From e5e91ad05d16f2b74afee0c41fd6360048dbf03c Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Thu, 16 May 2024 11:29:32 +0100
Subject: [PATCH 07/16] [RadioFrance] Fix thumb detection on profiles

---
 yt_dlp/extractor/radiofrance.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 449fb7e74e..b8d808abfa 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -425,8 +425,9 @@ def _call_api(self, station, profile_id, cursor):
 
         # On profile pages, the data is stored in a javascript array in the final <script>
         # Each episode is stored as
-        # a[0] = { id: ... }; a[1] = [ id: ... ]; etc.
-        # Annoyingly, sometimes it is delivered using 'b', with 'a' holding metadata
+        # a[0] = { id: ... }; a[1] = [ id: ... ]; on page 2->
+        # If a page had a thumbnail, the a variable contains image data,
+        # and episode data is stored in b[0]...
         resp['items'] = []
         podcastindex = 0
         nextmatch = True
@@ -447,6 +448,11 @@ def _call_api(self, station, profile_id, cursor):
 
         resp['metadata'] = self._search_json(r'content:\s*', webpage, profile_id, profile_id,
                                              transform_source=js_to_json)
+        # If the image data is stored separately rather than in the main content area
+        if resp['metadata']['visual'] and isinstance(resp['metadata']['visual'], str):
+            imagedata = dict()
+            imagedata['src'] = self._og_search_thumbnail(webpage)
+            resp['metadata']['visual'] = imagedata
 
         return resp
 

From dd74aa0bca55c33b0f40185101023c46cd420308 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Thu, 16 May 2024 11:45:17 +0100
Subject: [PATCH 08/16] [RadioFrance] Fix quote styling

---
 yt_dlp/extractor/radiofrance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index b8d808abfa..bcc2fa0a78 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -281,10 +281,10 @@ def _generate_playlist_entries(self, station, content_id, content_response):
                         'thumbnail': ('visual', 'src'),
                     }))
 
-            if not content_response["next"]:
+            if not content_response['next']:
                 break
 
-            content_response = self._call_api(station, content_id, content_response["next"])
+            content_response = self._call_api(station, content_id, content_response['next'])
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)

From 867bf965bbe7c30a4fc0d4964dee9957d140a0a1 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Tue, 15 Oct 2024 14:23:47 +0100
Subject: [PATCH 09/16] [RadioFrance] Fix playlist api parse

---
 yt_dlp/extractor/radiofrance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 9c90c3f4e3..0d071ceef1 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -274,7 +274,7 @@ def _generate_playlist_entries(self, station, content_id, content_response):
         for page_num in itertools.count(2):
             for entry in content_response['items']:
                 yield self.url_result(
-                    f'https://www.radiofrance.fr/{entry["path"]}', url_transparent=True, **traverse_obj(entry, {
+                    f'https://www.radiofrance.fr{entry["link"]}', url_transparent=True, **traverse_obj(entry, {
                         'title': 'title',
                         'description': 'standFirst',
                         'timestamp': ('publishedDate', {int_or_none}),
@@ -323,7 +323,7 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
             'id': 'eaf6ef81-a980-4f1c-a7d1-8a75ecd54b17',
             'display_id': 'le-billet-vert',
             'title': 'Le billet sciences',
-            'description': 'md5:eb1007b34b0c0a680daaa71525bbd4c1',
+            'description': 'md5:85d5ce8c488192e71904c551d595f4da',
             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
         },
         'playlist_mincount': 11,

From e01fab70415a82c5fd16681a7ec5fa279c47a3bf Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Tue, 15 Oct 2024 14:44:48 +0100
Subject: [PATCH 10/16] [RadioFrance] Fix profile pagination detection

---
 yt_dlp/extractor/radiofrance.py | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 0d071ceef1..ab3010b22b 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -392,7 +392,7 @@ class RadioFranceProfileIE(RadioFrancePlaylistBaseIE):
     _VALID_URL = rf'{RadioFranceBaseIE._VALID_URL_BASE}/personnes/(?P<id>[\w-]+)'
 
     _TESTS = [{
-        'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet?p=3',
+        'url': 'https://www.radiofrance.fr/personnes/thomas-pesquet',
         'info_dict': {
             'id': '86c62790-e481-11e2-9f7b-782bcb6744eb',
             'display_id': 'thomas-pesquet',
@@ -422,30 +422,24 @@ def _call_api(self, station, profile_id, cursor):
         webpage = self._download_webpage(url, profile_id, note=f'Downloading {profile_id} page {cursor}')
 
         resp = dict()
-
-        # On profile pages, the data is stored in a javascript array in the final <script>
-        # Each episode is stored as
-        # a[0] = { id: ... }; a[1] = [ id: ... ]; on page 2->
-        # If a page had a thumbnail, the a variable contains image data,
-        # and episode data is stored in b[0]...
         resp['items'] = []
-        podcastindex = 0
-        nextmatch = True
-        while nextmatch:
-            nextmatch = self._search_json(r'\w+\[' + str(podcastindex) + r'\]\s*=\s*', webpage, profile_id,
-                                          profile_id, transform_source=js_to_json, fatal=False, default=None)
-            podcastindex += 1
-            if nextmatch is not None:
-                resp['items'].append(nextmatch)
 
-        # There is more than one pagination key in the final <script>
-        # We should use pick the pagination object which is within a documents object
+        # get episode data from page
         pagedata = self._search_json(r'documents\s*:\s*', webpage, profile_id, profile_id,
                                      transform_source=js_to_json)
-        lastPage = traverse_obj(pagedata, ('pagination', 'lastPage'))
+
+        # get thepage data
+        pagekey = pagedata['pagination']
+        hasMorePages = False
+        lastPage = int(self._search_regex(pagekey+'\.lastPage=(\d+);', webpage, profile_id, '0'))
         hasMorePages = cursor < lastPage
         resp['next'] = cursor + 1 if hasMorePages else None
 
+        # get episode data, note, not all will be A/V, so filter for 'expression'
+        for item in pagedata['items']:
+            if item['model']=='Expression':
+                resp['items'].append(item)
+
         resp['metadata'] = self._search_json(r'content:\s*', webpage, profile_id, profile_id,
                                              transform_source=js_to_json)
         # If the image data is stored separately rather than in the main content area

From 9d54ffc768b3beac61407c27fec5aa956aacb058 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Tue, 15 Oct 2024 14:52:11 +0100
Subject: [PATCH 11/16] [RadioFrance] Update tests for program schedule (grille)

---
 yt_dlp/extractor/radiofrance.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index ab3010b22b..a293e9652d 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -431,7 +431,7 @@ def _call_api(self, station, profile_id, cursor):
         # get thepage data
         pagekey = pagedata['pagination']
         hasMorePages = False
-        lastPage = int(self._search_regex(pagekey+'\.lastPage=(\d+);', webpage, profile_id, '0'))
+        lastPage = int(self._search_regex(pagekey+r'\.lastPage=(\d+);', webpage, profile_id, '0'))
         hasMorePages = cursor < lastPage
         resp['next'] = cursor + 1 if hasMorePages else None
 
@@ -464,14 +464,14 @@ class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
             'id': 'franceinter-program-20230217',
             'upload_date': '20230217',
         },
-        'playlist_count': 25,
+        'playlist_count': 27,
     }, {
         'url': 'https://www.radiofrance.fr/franceculture/grille-programmes?date=01-02-2023',
         'info_dict': {
             'id': 'franceculture-program-20230201',
             'upload_date': '20230201',
         },
-        'playlist_count': 25,
+        'playlist_count': 29,
     }, {
         'url': 'https://www.radiofrance.fr/mouv/grille-programmes?date=19-03-2023',
         'info_dict': {
@@ -485,7 +485,7 @@ class RadioFranceProgramScheduleIE(RadioFranceBaseIE):
             'id': 'francemusique-program-20230318',
             'upload_date': '20230318',
         },
-        'playlist_count': 15,
+        'playlist_count': 16,
     }, {
         'url': 'https://www.radiofrance.fr/franceculture/grille-programmes',
         'only_matching': True,

From 3c5e3af7bce1a185d975e085a1ffdc167471e324 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Tue, 15 Oct 2024 14:54:09 +0100
Subject: [PATCH 12/16] [RadioFrance] Remove defunct test

---
 yt_dlp/extractor/radiofrance.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index a293e9652d..e4d73b241f 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -18,18 +18,6 @@ class RadioFranceIE(InfoExtractor):
     _VALID_URL = r'https?://maison\.radiofrance\.fr/radiovisions/(?P<id>[^?#]+)'
     IE_NAME = 'radiofrance'
 
-    _TEST = {
-        'url': 'http://maison.radiofrance.fr/radiovisions/one-one',
-        'md5': 'bdbb28ace95ed0e04faab32ba3160daf',
-        'info_dict': {
-            'id': 'one-one',
-            'ext': 'ogg',
-            'title': 'One to one',
-            'description': "Plutôt que d'imaginer la radio de demain comme technologie ou comme création de contenu, je veux montrer que quelles que soient ses évolutions, j'ai l'intime conviction que la radio continuera d'être un grand média de proximité pour les auditeurs.",
-            'uploader': 'Thomas Hercouët',
-        },
-    }
-
     def _real_extract(self, url):
         m = self._match_valid_url(url)
         video_id = m.group('id')

From 0fb8bc11ed57a0f69d3433e369bbb3c872cb5521 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Tue, 15 Oct 2024 15:04:48 +0100
Subject: [PATCH 13/16] [RadioFrance] Fix ruff issues

---
 yt_dlp/extractor/radiofrance.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index e4d73b241f..7d47ba686f 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -259,7 +259,7 @@ def _call_api(self, station, content_id, cursor):
         raise NotImplementedError('This method must be implemented by subclasses')
 
     def _generate_playlist_entries(self, station, content_id, content_response):
-        for page_num in itertools.count(2):
+        while True:
             for entry in content_response['items']:
                 yield self.url_result(
                     f'https://www.radiofrance.fr{entry["link"]}', url_transparent=True, **traverse_obj(entry, {
@@ -269,11 +269,11 @@ def _generate_playlist_entries(self, station, content_id, content_response):
                         'thumbnail': ('visual', 'src'),
                     }))
 
-            if not content_response['next']:
+            if content_response['next']:
+                content_response = self._call_api(station, content_id, content_response['next'])
+            else:
                 break
 
-            content_response = self._call_api(station, content_id, content_response['next'])
-
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
         # If it is a podcast playlist, get the name of the station it is on
@@ -358,7 +358,7 @@ def _call_api(self, station, podcast_id, cursor):
         url = 'https://www.radiofrance.fr/' + station + '/podcasts/' + podcast_id + '?p=' + str(cursor)
         webpage = self._download_webpage(url, podcast_id, note=f'Downloading {podcast_id} page {cursor}')
 
-        resp = dict()
+        resp = {}
 
         # _search_json cannot parse the data as it contains javascript
         # Therefore, parse the episodes objects array separately
@@ -409,7 +409,7 @@ def _call_api(self, station, profile_id, cursor):
         url = 'https://www.radiofrance.fr/personnes/' + profile_id + '?p=' + str(cursor)
         webpage = self._download_webpage(url, profile_id, note=f'Downloading {profile_id} page {cursor}')
 
-        resp = dict()
+        resp = {}
         resp['items'] = []
 
         # get episode data from page
@@ -432,7 +432,7 @@ def _call_api(self, station, profile_id, cursor):
                                              transform_source=js_to_json)
         # If the image data is stored separately rather than in the main content area
         if resp['metadata']['visual'] and isinstance(resp['metadata']['visual'], str):
-            imagedata = dict()
+            imagedata = {}
             imagedata['src'] = self._og_search_thumbnail(webpage)
             resp['metadata']['visual'] = imagedata
 

From 9e3ac8951450b727d56ce5ed1eecef5d24414e75 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Tue, 15 Oct 2024 16:28:49 +0100
Subject: [PATCH 14/16] [RadioFrance] support pages with embedded playback info

---
 yt_dlp/extractor/radiofrance.py | 62 +++++++++++++++++++++++++++------
 1 file changed, 51 insertions(+), 11 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 7d47ba686f..f7ee6ad6ad 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -1,4 +1,3 @@
-import itertools
 import re
 
 from .common import InfoExtractor
@@ -261,19 +260,41 @@ def _call_api(self, station, content_id, cursor):
     def _generate_playlist_entries(self, station, content_id, content_response):
         while True:
             for entry in content_response['items']:
-                yield self.url_result(
-                    f'https://www.radiofrance.fr{entry["link"]}', url_transparent=True, **traverse_obj(entry, {
-                        'title': 'title',
-                        'description': 'standFirst',
-                        'timestamp': ('publishedDate', {int_or_none}),
-                        'thumbnail': ('visual', 'src'),
-                    }))
+                if entry['link'] == '':
+                    yield entry
+                else:
+                    yield self.url_result(
+                        f'https://www.radiofrance.fr{entry["link"]}', url_transparent=True, **traverse_obj(entry, {
+                            'title': 'title',
+                            'description': 'standFirst',
+                            'timestamp': ('publishedDate', {int_or_none}),
+                            'thumbnail': ('visual', 'src'),
+                        }))
 
             if content_response['next']:
                 content_response = self._call_api(station, content_id, content_response['next'])
             else:
                 break
 
+    def _extract_embedded_episodes(self, item, webpage, content_id):
+        """Certain episodes' data is embedded directly in the page; use it when the link is missing"""
+        links = item['playerInfo']['media']['sources']
+        item['formats'] = []
+        for linkkey in links:
+            url = self._search_regex(linkkey+r'\.url="([^"]+)";', webpage, content_id)
+            dur = int(self._search_regex(linkkey+r'\.duration=(\d+);', webpage, content_id))
+            preset = self._search_json(linkkey+r'\.preset=', webpage, content_id, content_id, contains_pattern=r'\{.+\}', transform_source=js_to_json)
+            item['formats'].append({
+                'format_id': preset['id'],
+                'url': url,
+                'vcodec': 'none',
+                'acodec': preset['encoding'],
+                'quality': preset['bitrate'],
+                'duration': dur
+            })
+            item['duration'] = dur
+        return item
+
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
         # If it is a podcast playlist, get the name of the station it is on
@@ -343,6 +364,16 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
         },
         'playlist_mincount': 321,
+    }, {
+        'url': 'http://www.radiofrance.fr/franceculture/podcasts/serie-les-aventures-de-tintin-les-cigares-du-pharaon',
+        'info_dict': {
+            'id': '01b096c6-e7f8-49c4-8319-dd399221885b',
+            'display_id': 'serie-les-aventures-de-tintin-les-cigares-du-pharaon',
+            'title': 'Les Cigares du Pharaon\xa0: les Aventures de Tintin',
+            'description': 'md5:1c5b6d010b2aaeb0d90b2c233b5f7b15',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+        },
+        'playlist_count': 5
     }, {
         'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
         'only_matching': True,
@@ -359,12 +390,19 @@ def _call_api(self, station, podcast_id, cursor):
         webpage = self._download_webpage(url, podcast_id, note=f'Downloading {podcast_id} page {cursor}')
 
         resp = {}
+        resp['items'] = []
 
         # _search_json cannot parse the data as it contains javascript
         # Therefore, parse the episodes objects array separately
-        resp['items'] = self._search_json(r'a.items\s*=\s*', webpage, podcast_id, podcast_id,
+        itemlist = self._search_json(r'a.items\s*=\s*', webpage, podcast_id, podcast_id,
                                           contains_pattern=r'\[.+\]', transform_source=js_to_json)
 
+        for item in itemlist:
+            if item['model'] == 'Expression':
+                if item['link'] == '':
+                    item = self._extract_embedded_episodes(item, webpage, podcast_id)
+                resp['items'].append(item)
+
         # the pagination data is stored in a javascript object 'a'
         lastPage = int(re.search(r'a\.lastPage\s*=\s*(\d+);', webpage).group(1))
         hasMorePages = cursor < lastPage
@@ -416,7 +454,7 @@ def _call_api(self, station, profile_id, cursor):
         pagedata = self._search_json(r'documents\s*:\s*', webpage, profile_id, profile_id,
                                      transform_source=js_to_json)
 
-        # get thepage data
+        # get the page data
         pagekey = pagedata['pagination']
         hasMorePages = False
         lastPage = int(self._search_regex(pagekey+r'\.lastPage=(\d+);', webpage, profile_id, '0'))
@@ -425,7 +463,9 @@ def _call_api(self, station, profile_id, cursor):
 
         # get episode data, note, not all will be A/V, so filter for 'expression'
         for item in pagedata['items']:
-            if item['model']=='Expression':
+            if item['model'] == 'Expression':
+                if item['link'] == '':
+                    item = self._extract_embedded_episodes(item, webpage, profile_id)
                 resp['items'].append(item)
 
         resp['metadata'] = self._search_json(r'content:\s*', webpage, profile_id, profile_id,

From dcd0ee3ec3ecdf9afee7a177bd6e33cfa5841543 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Tue, 15 Oct 2024 16:30:19 +0100
Subject: [PATCH 15/16] [RadioFrance] ruff trailing commas

---
 yt_dlp/extractor/radiofrance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index f7ee6ad6ad..0a95a8c928 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -290,7 +290,7 @@ def _extract_embedded_episodes(self, item, webpage, content_id):
                 'vcodec': 'none',
                 'acodec': preset['encoding'],
                 'quality': preset['bitrate'],
-                'duration': dur
+                'duration': dur,
             })
             item['duration'] = dur
         return item
@@ -373,7 +373,7 @@ class RadioFrancePodcastIE(RadioFrancePlaylistBaseIE):
             'description': 'md5:1c5b6d010b2aaeb0d90b2c233b5f7b15',
             'thumbnail': r're:^https?://.*\.(?:jpg|png)',
         },
-        'playlist_count': 5
+        'playlist_count': 5,
     }, {
         'url': 'https://www.radiofrance.fr/franceinter/podcasts/le-7-9',
         'only_matching': True,

From dda6f7b5638b46b604f45bb50bf0368fa0f20b03 Mon Sep 17 00:00:00 2001
From: lonm <LonMcGregor@users.noreply.github.com>
Date: Tue, 15 Oct 2024 16:35:28 +0100
Subject: [PATCH 16/16] [RadioFrance] run autopep

---
 yt_dlp/extractor/radiofrance.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/yt_dlp/extractor/radiofrance.py b/yt_dlp/extractor/radiofrance.py
index 0a95a8c928..31851adb9d 100644
--- a/yt_dlp/extractor/radiofrance.py
+++ b/yt_dlp/extractor/radiofrance.py
@@ -281,9 +281,9 @@ def _extract_embedded_episodes(self, item, webpage, content_id):
         links = item['playerInfo']['media']['sources']
         item['formats'] = []
         for linkkey in links:
-            url = self._search_regex(linkkey+r'\.url="([^"]+)";', webpage, content_id)
-            dur = int(self._search_regex(linkkey+r'\.duration=(\d+);', webpage, content_id))
-            preset = self._search_json(linkkey+r'\.preset=', webpage, content_id, content_id, contains_pattern=r'\{.+\}', transform_source=js_to_json)
+            url = self._search_regex(linkkey + r'\.url="([^"]+)";', webpage, content_id)
+            dur = int(self._search_regex(linkkey + r'\.duration=(\d+);', webpage, content_id))
+            preset = self._search_json(linkkey + r'\.preset=', webpage, content_id, content_id, contains_pattern=r'\{.+\}', transform_source=js_to_json)
             item['formats'].append({
                 'format_id': preset['id'],
                 'url': url,
@@ -395,7 +395,7 @@ def _call_api(self, station, podcast_id, cursor):
         # _search_json cannot parse the data as it contains javascript
         # Therefore, parse the episodes objects array separately
         itemlist = self._search_json(r'a.items\s*=\s*', webpage, podcast_id, podcast_id,
-                                          contains_pattern=r'\[.+\]', transform_source=js_to_json)
+                                     contains_pattern=r'\[.+\]', transform_source=js_to_json)
 
         for item in itemlist:
             if item['model'] == 'Expression':
@@ -457,7 +457,7 @@ def _call_api(self, station, profile_id, cursor):
         # get the page data
         pagekey = pagedata['pagination']
         hasMorePages = False
-        lastPage = int(self._search_regex(pagekey+r'\.lastPage=(\d+);', webpage, profile_id, '0'))
+        lastPage = int(self._search_regex(pagekey + r'\.lastPage=(\d+);', webpage, profile_id, '0'))
         hasMorePages = cursor < lastPage
         resp['next'] = cursor + 1 if hasMorePages else None